From 10d1ddb0cfc0d1c1c6779cd9d4dfd489ea833c9a Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Wed, 3 Jul 2024 13:57:51 +0200
Subject: [PATCH 01/73] Initial draft.

Rebased.
Dynamic shapes WIP
BF16 support
End-to-end compression WIP
Add logic to compare numpy to ov computations
Added release_memory
Added a script to run multiple experiments sequentially
INT4 experiments
BF16 fix
INT4 performance gains WIP
weight_lowering dir created
weight_lowering_ dir changes
Renamed
Something works
File updates
Removed dispatching
---
 docs/api/source/conf.py                       |   1 +
 nncf/common/logging/logger.py                 |   6 +
 nncf/openvino/graph/node_utils.py             |   6 +-
 .../algorithms/weight_compression/config.py   |   3 +
 .../weight_compression/openvino_backend.py    |  94 ++---
 .../weight_compression/openvino_modeling.py   | 345 ++++++++++++++++
 .../weight_compression/scale_estimation.py    |  34 +-
 .../weight_compression/weight_lowering.py     | 183 +++++----
 nncf/quantization/fake_quantize.py            |  19 +-
 nncf/tensor/definitions.py                    |   1 +
 nncf/tensor/functions/__init__.py             |   3 +
 nncf/tensor/functions/ov.py                   |  41 ++
 nncf/utils.py                                 |  32 ++
 run_weight_compression.py                     | 373 ++++++++++++++++++
 .../quantization/test_weights_compression.py  |   6 +-
 weight_compression.py                         | 234 +++++++++++
 16 files changed, 1214 insertions(+), 167 deletions(-)
 create mode 100644 nncf/quantization/algorithms/weight_compression/openvino_modeling.py
 create mode 100644 nncf/tensor/functions/ov.py
 create mode 100644 nncf/utils.py
 create mode 100644 run_weight_compression.py
 create mode 100644 weight_compression.py

diff --git a/docs/api/source/conf.py b/docs/api/source/conf.py
index fe3afe0525c..ca5b7a11e0f 100644
--- a/docs/api/source/conf.py
+++ b/docs/api/source/conf.py
@@ -142,6 +142,7 @@ def collect_api_entities() -> APIInfo:
         "nncf.tensor.functions.numpy_linalg",
         "nncf.tensor.functions.torch_numeric",
         "nncf.tensor.functions.torch_linalg",
+        "nncf.tensor.functions.ov",
     ]
 
     with mock(mock_modules):
diff --git a/nncf/common/logging/logger.py b/nncf/common/logging/logger.py
index 5ba4b9a257c..e13fcaa8442 100644
--- a/nncf/common/logging/logger.py
+++ b/nncf/common/logging/logger.py
@@ -12,6 +12,7 @@
 import logging
 import sys
 from contextlib import contextmanager
+from functools import lru_cache
 
 NNCF_LOGGER_NAME = "nncf"
 
@@ -86,3 +87,8 @@ def warn_bkc_version_mismatch(backend: str, bkc_version: str, current_version: s
         f"while current {backend} version is {current_version}. "
         f"If you encounter issues, consider switching to {backend}{bkc_version}"
     )
+
+
+@lru_cache(None)
+def log_once(level, message):
+    nncf_logger.log(level, message)
diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py
index 7496187adb1..8fab3933945 100644
--- a/nncf/openvino/graph/node_utils.py
+++ b/nncf/openvino/graph/node_utils.py
@@ -8,7 +8,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import os
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type
 
 import numpy as np
@@ -115,7 +115,9 @@ def get_const_value(const_node: ov.Node) -> np.ndarray:
 
     :param const_node: OpenVINO node.
     :return: The constant value.
""" - if const_node.get_element_type() == ov.Type.bf16: + INPUT_DTYPE = os.environ.get("INPUT_DTYPE", "fp32") + NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + if const_node.get_element_type() == ov.Type.bf16 and (INPUT_DTYPE != "bf16" or NUMPY_COMPRESSION): # Fixed FP32 data type as the result for BF16 constant return const_node.get_data(dtype=np.float32) return const_node.data diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 56dbc24f2e2..ce512331349 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -46,6 +46,9 @@ def is_integer(self): """ return self.mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + def __hash__(self): + return hash((self.mode.value, self.group_size)) + @dataclass class WeightCompressionParameters: diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 3d17d1a6af4..c00cb82a3f2 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -244,21 +244,38 @@ def _create_compression_subgraph( original_shape = weight.shape compressed_weight = compress_weight(weight, reduction_axes, compression_config, layer_scales, layer_zero_points) - compressed_const = opset.constant(compressed_weight.tensor.data, dtype=compression_dtype, name=const_node_name) + compressed_weight_data = compressed_weight.tensor.data + if isinstance(compressed_weight_data, ov.Tensor): + compressed_const = opset.constant(compressed_weight_data, name=const_node_name) + else: + compressed_const = opset.constant(compressed_weight_data, dtype=compression_dtype, name=const_node_name) + if compressed_const.get_element_type() != compression_dtype: + compressed_const = opset.convert(compressed_const, compression_dtype) converted_const = opset.convert(compressed_const, ov.Type.f16) - if compressed_weight.zero_point is not None and compressed_weight.tensor.dtype == TensorDataType.uint8: - zero_point_const = opset.constant( - compressed_weight.zero_point.data, - dtype=compression_dtype, - name=f"{const_node_name}/zero_point", - ) - converted_zero_point = opset.convert(zero_point_const, ov.Type.f16) + if compressed_weight.zero_point is not None: + zero_point_data = compressed_weight.zero_point.data + if isinstance(zero_point_data, ov.Tensor): + zero_point_const = opset.constant( + compressed_weight.zero_point.data, + name=f"{const_node_name}/zero_point", + ) + else: + zero_point_const = opset.constant( + compressed_weight.zero_point.data, + dtype=compression_dtype, + name=f"{const_node_name}/zero_point", + ) + zero_point_const = opset.convert(zero_point_const, ov.Type.f16) converted_const = opset.subtract( - converted_const, converted_zero_point, name=f"{const_node_name}/zero_point/subtract" + converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract" ) - scale_const = opset.constant(compressed_weight.scale.data, dtype=scale_dtype, name=f"{const_node_name}/scale") - if scale_dtype != ov.Type.f16: + scale_data = compressed_weight.scale.data + if isinstance(scale_data, ov.Tensor): + scale_const = opset.constant(scale_data, name=f"{const_node_name}/scale") + else: + scale_const = opset.constant(scale_data, dtype=scale_dtype, name=f"{const_node_name}/scale") + if scale_const.get_element_type() 
!= ov.Type.f16: scale_const = opset.convert(scale_const, ov.Type.f16) mul = opset.multiply( @@ -302,6 +319,9 @@ def transform_model( layer_zero_points = ( None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name) ) + import os + + os.environ["CURRENT_NODE_NAME"] = wc_params.weight_name mul, compressed_weight = self._create_compression_subgraph( weight=weight, compression_config=wc_params.compression_config, @@ -333,58 +353,6 @@ def dump_parameters( ) -> None: dump_parameters(model, parameters, algo_name, path) - @staticmethod - def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): - parameters, clamp = OVWeightCompressionAlgoBackend.get_compress_pipeline( - config, w_shape, s_shape, z_p_shape, True - ) - - if len(parameters) == 3: - _, s, zp = parameters - result = (clamp - zp) * s - else: - s = parameters[1] - result = clamp * s - - model = ov.Model([result], parameters) - - compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision: Type.f32}) - - return lambda parameters: compiled_model(parameters)[0] - - @staticmethod - def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, return_nodes=False): - mode = config.mode - assert mode in [ - CompressWeightsMode.INT4_SYM, - CompressWeightsMode.INT4_ASYM, - ], f"Only int4 supported, but given={mode}" - num_bits = config.num_bits - - asym_quant = mode in [CompressWeightsMode.INT4_ASYM] - level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) - level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - - w = opset.parameter(w_shape, name="w") - s = opset.parameter(s_shape, name="s") - parameters = [w, s] - compressed_w = w / s - if z_p_shape is not None: - zp = opset.parameter(z_p_shape, name="zp") - parameters.append(zp) - compressed_w += zp - - result = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights") - - if return_nodes: - return parameters, result - - model = ov.Model([result], parameters) - - compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision: Type.f32}) - - return lambda parameters: compiled_model(parameters)[0] - class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend): @staticmethod diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py new file mode 100644 index 00000000000..b4443970e30 --- /dev/null +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -0,0 +1,345 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import numpy as np +import openvino as ov +from openvino.runtime import opset13 as opset + +import nncf +from nncf import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig + + +@dataclass +class OVModelParameters: + dynamic: bool = False + recompile: bool = False + release_memory: bool = True + share_outputs: bool = True + input_dtype: str = "fp32" + + def __hash__(self): + return hash((self.dynamic, self.recompile, self.release_memory, self.share_outputs, self.input_dtype)) + + +class CompiledModelCache: + def __init__(self): + self._cache = {} + + def clear(self): + self._cache.clear() + + def is_empty(self): + return len(self._cache) == 0 + + +COMPILED_MODEL_CACHE = CompiledModelCache() + + +def clear_cache(): + COMPILED_MODEL_CACHE.clear() + + +def cache_results(func): + def wrapper(*args, **kwargs): + sig = inspect.signature(func) + new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} + new_kwargs.update(kwargs) + cache_key = (func.__name__, frozenset(new_kwargs.items())) + recompile = new_kwargs.get("ov_model_params", OVModelParameters()).recompile + cache = COMPILED_MODEL_CACHE._cache + if not recompile and cache_key in cache: + return cache[cache_key] + result = func(*args, **kwargs) + cache[cache_key] = result + return result + + return wrapper + + +@cache_results +def get_compress_weight_model( + config: WeightCompressionConfig, + weight_shape: Tuple, + scale_shape: Optional[Tuple] = None, + zero_point_shape: Optional[Tuple] = None, + reduction_axes: Optional[Tuple] = None, + ov_model_params: Optional[OVModelParameters] = None, +): + if scale_shape is None and zero_point_shape is not None: + raise Exception("Zero point shape can only be provided if scale shape is provided.") + # if (scale_shape is None) != (reduction_axes is not None): + # raise Exception("Either one of scale_shape or reduction_axes must be provided at the same time.") + + if ov_model_params.dynamic: + weight_shape = (-1,) * len(weight_shape) + if scale_shape is not None: + scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) + if zero_point_shape is not None: + zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) + + return _build_compress_model( + config, + ov_model_params, + weight_shape, + scale_shape, + zero_point_shape, + reduction_axes, + return_nodes=False, + ) + + +@cache_results +def get_compress_decompress_weight_model( + config: WeightCompressionConfig, + weight_shape: Tuple, + scale_shape: Optional[Tuple], + zero_point_shape: Optional[Tuple] = None, + ov_model_params: Optional[OVModelParameters] = None, +): + if ov_model_params is None: + ov_model_params = OVModelParameters() + if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: + ov_model_params.dynamic = False + + if ov_model_params.dynamic: + weight_shape = (-1,) * len(weight_shape) + scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) + if zero_point_shape is not None: + zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) + + 
return _build_compress_decompress_model( + config, + ov_model_params, + weight_shape, + scale_shape, + zero_point_shape, + ) + + +def _build_compress_decompress_model( + config: WeightCompressionConfig, + ov_model_params: OVModelParameters, + weight_shape: Tuple, + scale_shape: Tuple, + zero_point_shape: Optional[Tuple] = None, +): + ov_parameters, ov_results = _build_compress_model( + config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True + ) + return _get_compress_decompress_model( + config, + ov_model_params, + ov_parameters, + ov_results, + ) + + +def _build_compress_model( + config: WeightCompressionConfig, + ov_model_params: OVModelParameters, + weight_shape: Tuple, + scale_shape: Optional[Tuple] = None, + zero_point_shape: Optional[Tuple] = None, + reduction_axes: Optional[Tuple] = None, + return_nodes: bool = False, +): + if ov_model_params.input_dtype == "fp32": + input_dtype = ov.Type.f32 + elif ov_model_params.input_dtype == "fp16": + input_dtype = ov.Type.f16 + elif ov_model_params.input_dtype == "bf16": + input_dtype = ov.Type.bf16 + else: + raise Exception + weight = opset.parameter(weight_shape, name="w", dtype=input_dtype) + ov_parameters = [weight] + + if scale_shape is not None: + # Compute only the compressed weight + + scale = opset.parameter(scale_shape, name="s", dtype=ov.Type.f32) + ov_parameters.append(scale) + + zero_point = None + if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: + zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) + ov_parameters.append(zero_point) + else: + # Compute compressed weight, scale and, possibly, zero point + + group_size = config.group_size + if group_size != -1: + if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: + reduction_axes = reduction_axes[0] + if not isinstance(reduction_axes, int): + raise NotImplementedError( + f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." 
+                )
+            channel_size = weight.shape[reduction_axes]
+            if channel_size % group_size != 0:
+                raise nncf.ValidationError(
+                    f"Channel size {channel_size} should be divisible by size of group {group_size}"
+                )
+
+            num_groups_per_channel = channel_size // group_size
+            shape = list(weight.shape)  # [a1, r, a2] - "r" refers to number of channels along reduction axis
+            shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size)
+            weight = opset.reshape(weight, shape, special_zero=False)
+            reduction_axes += 1
+
+        mode = config.mode
+        num_bits = config.num_bits
+        eps = np.finfo(np.float32).eps
+        if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
+            min_values = opset.reduce_min(
+                weight, reduction_axes=reduction_axes, keep_dims=True
+            )  # [a1, r, a2] -> [a1, 1, a2]
+            max_values = opset.reduce_max(
+                weight, reduction_axes=reduction_axes, keep_dims=True
+            )  # [a1, r, a2] -> [a1, 1, a2]
+            min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32)
+
+            level_low = 0
+            level_high = 2**num_bits - 1
+            levels = level_high - level_low + 1
+            scale = (max_values - min_values) / opset.constant(levels - 1, ov.Type.f32)
+            scale = opset.select(opset.abs(scale) < eps, eps, scale)
+
+            zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale)
+            zero_point = opset.clamp(zero_point, level_low, level_high)
+        else:
+            zero_point = None
+            level_high = opset.constant(2 ** (num_bits - 1), ov.Type.f32)
+
+            w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True))
+            w_max = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True)
+            w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32)
+
+            scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.constant(0, ov.Type.f32) - w_max)
+            scale /= level_high
+            scale = opset.select(opset.abs(scale) < eps, eps, scale)
+
+    return _get_compress_model(
+        config,
+        ov_model_params,
+        ov_parameters,
+        weight,
+        scale,
+        zero_point,
+        return_nodes,
+    )
+
+
+def _get_compress_model(
+    config: WeightCompressionConfig,
+    ov_model_params: OVModelParameters,
+    ov_parameters: List[ov._pyopenvino.op.Parameter],
+    w: ov.runtime.Node,
+    s: ov.runtime.Node,
+    zp: Optional[ov.runtime.Node] = None,
+    return_nodes: Optional[bool] = False,
+):
+    if w.get_element_type() != ov.Type.f32:
+        w = opset.convert(w, ov.Type.f32)
+
+    compressed_w = w / s
+
+    num_bits = config.num_bits
+    if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
+        # dtype = ov.Type.u8
+        dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4
+        level_low = 0
+        level_high = 2**num_bits - 1
+        compressed_w += opset.convert(zp, ov.Type.f32)
+    elif config.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT4_SYM]:
+        # dtype = ov.Type.i8
+        dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4
+        level_low = -(2 ** (num_bits - 1))
+        level_high = 2 ** (num_bits - 1) - 1
+    else:
+        raise nncf.ValidationError(f"Unsupported compression mode: {config.mode}")
+
+    compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high)
+    compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights")
+
+    ov_results = [compressed_w]
+    if len(ov_parameters) == 1:
+        ov_results.append(s)
+        if zp is not None:
+            ov_results.append(opset.convert(zp, compressed_w.get_element_type()))
+
+    if return_nodes:
+        return ov_parameters, ov_results
+
+    model = ov.Model(ov_results, ov_parameters)
+    compiled_model = ov.compile_model(model, 
device_name="CPU") + + def infer(inputs): + infer_request = compiled_model.create_infer_request() + infer_request.infer(inputs, share_outputs=ov_model_params.share_outputs) + outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))] + if ov_model_params.release_memory: + compiled_model.release_memory() + return outputs + + return infer + + +def _get_compress_decompress_model( + config: WeightCompressionConfig, + ov_model_params: OVModelParameters, + parameters: List[ov._pyopenvino.op.Parameter], + results: List[ov._pyopenvino.Node], +): + if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: + if len(results) == 1: + compressed_w = results[0] + s, zp = parameters[1], parameters[2] + else: + compressed_w, s, zp = results + decompressed_w = (compressed_w - zp) * s + else: + if len(results) == 1: + compressed_w = results[0] + s = parameters[1] + else: + compressed_w, s = results + decompressed_w = compressed_w * s + + model = ov.Model([decompressed_w], parameters) + compiled_model = ov.compile_model(model, device_name="CPU") + + def infer(inputs): + infer_request = compiled_model.create_infer_request() + infer_request.infer(inputs, share_outputs=ov_model_params.share_outputs) + outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))] + if ov_model_params.release_memory: + compiled_model.release_memory() + return outputs + + return infer diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 0596e94d432..1b4827038c9 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -27,6 +27,8 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization @@ -44,8 +46,6 @@ class ScaleEstimation: Scale estimation algorithm implementation. 
""" - compress_decompress_cache = {} - def __init__( self, model: TModel, @@ -256,41 +256,20 @@ def calculate_quantization_params( if weight_penalty > 0.0: min_max_scale_diffs += weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - zp_shape = zp.shape if zp is not None else None - key = (config.mode, config.num_bits) + q_weights.shape + scale.shape - if zp is not None: - key += zp_shape - if config.mode != CompressWeightsMode.NF4: - if key in ScaleEstimation.compress_decompress_cache: - compress_decompress_model = ScaleEstimation.compress_decompress_cache[key]["compress_decompress_model"] - compress_model = ScaleEstimation.compress_decompress_cache[key]["compress_model"] - else: - compress_decompress_model = backend_entity.get_compress_decompress_pipeline( - config, q_weights.shape, scale.shape, zp_shape - ) - compress_model = backend_entity.get_compress_pipeline(config, q_weights.shape, scale.shape, zp_shape) - ScaleEstimation.compress_decompress_cache[key] = { - "compress_decompress_model": compress_decompress_model, - "compress_model": compress_model, - } scale_sign = scale / fns.abs(scale) zero_scale = 0.001 zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - input_tensors = [original_weight.data, None] - if zp is not None: - input_tensors.append(zp.data) # iterative rectification of initial scale for i in range(initial_steps): near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) near_to_ideal_scale = near_to_ideal_scale * scale_sign - input_tensors[1] = near_to_ideal_scale.data if config.mode == CompressWeightsMode.NF4: g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: - out = compress_decompress_model(input_tensors) + out = calculate_quantized_dequantized_weight(original_weight, config, near_to_ideal_scale, zp) q_weights_ = fns.zeros_like(original_weight) + out q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) @@ -313,13 +292,12 @@ def calculate_quantization_params( else: near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale result_scale = near_to_ideal_scale - input_tensors[1] = near_to_ideal_scale.data if i < initial_steps - 1: if config.mode == CompressWeightsMode.NF4: out = do_nf4_quantization(original_weight, near_to_ideal_scale) else: - out = compress_model(input_tensors) + out = calculate_quantized_weight(original_weight, config, near_to_ideal_scale, zp) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) @@ -333,7 +311,7 @@ def calculate_quantization_params( if config.mode == CompressWeightsMode.NF4: out = do_nf4_quantization(original_weight, scaled_scale) else: - out = compress_model(input_tensors) + out = calculate_quantized_weight(original_weight, config, scaled_scale, zp) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) @@ -346,7 +324,7 @@ def calculate_quantization_params( g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: - out = compress_decompress_model(input_tensors) + out = calculate_quantized_dequantized_weight(original_weight, config, near_to_ideal_scale, zp) q_weights_ = fns.zeros_like(original_weight) + out q_outs = 
fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 342725c0237..08aff97d5cd 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -17,10 +17,13 @@ import nncf from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters, \ + get_compress_decompress_weight_model, get_compress_weight_model from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor from nncf.tensor import functions as fns from nncf.tensor.definitions import TensorDataType +from nncf.utils import is_openvino_available ReductionAxes = Tuple[int, ...] @@ -139,7 +142,7 @@ def calculate_e2m1_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val= return scale -def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bits=4) -> Tensor: +def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bits=4, invert_division=False) -> Tensor: """ Calculates the signed scale for symmetric quantization. @@ -154,7 +157,10 @@ def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bi w_max = fns.max(weight, axis=reduction_axes, keepdims=True) scale = fns.where(w_abs_min >= w_max, w_abs_min, -w_max) - scale /= level_high + if invert_division: + scale *= 1.0 / level_high + else: + scale /= level_high eps = fns.finfo(scale).eps scale = fns.where(fns.abs(scale) < eps, eps, scale) @@ -249,7 +255,7 @@ def calculate_normalized_weight_and_fp4_scale( def calculate_integer_quantization_params( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig + weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, invert_division=False ) -> Tuple[Tensor, Tensor]: """ Calculates the scale and zero point for uniform quantization (INT4, INT8), when the range of values is divided into @@ -273,7 +279,7 @@ def calculate_integer_quantization_params( min_values = fns.min(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] max_values = fns.max(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] scale, zero_point = calculate_scale_zero_point( - min_values, max_values, level_low, level_high, narrow_range=False + min_values, max_values, level_low, level_high, narrow_range=False, invert_division=invert_division ) return scale, zero_point @@ -286,7 +292,7 @@ def calculate_quantized_weight( config: WeightCompressionConfig, scale: Tensor, zero_point: Optional[Tensor] = None, - invert_scale=False, + invert_division=False, ) -> Tensor: """ Quantizes the weight tensor using the provided scale and zero point. @@ -295,7 +301,7 @@ def calculate_quantized_weight( :param config: Weight compression configuration. :param scale: Scale tensor used for quantization. :param zero_point: Zero point tensor used for quantization. - :param invert_scale: applies inversion for scale and then multiply by weights instead of division. + :param invert_division: applies inversion for scale and then multiply by weights instead of division. :return: Quantized weight tensor of uint8 or int8 type. 
""" if weight.dtype != TensorDataType.float32: @@ -309,9 +315,8 @@ def calculate_quantized_weight( level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - if invert_scale: - scale = fns.power(scale, -1) - compressed_weights = weight * scale + if invert_division: + compressed_weights = weight * (1.0 / scale) else: compressed_weights = weight / scale if zero_point is not None: @@ -322,63 +327,8 @@ def calculate_quantized_weight( return compressed_weights -def do_int_quantization( - weight: Tensor, - reduction_axes: ReductionAxes, - config: WeightCompressionConfig, - precomputed_scale: Tensor = None, - precomputed_zero_point: Tensor = None, - invert_scale=False, -) -> Tuple[Tensor, Tensor, Tensor]: - """ - The method quantizes the given weights to integer data type uniformly in accordance with the compression config. - The config defines a quantization mode: - INT8_SYM mode refers to signed int8 symmetric weight compression without zero point - - quantization to [-128, 127] range. - INT8_ASYM mode refers to unsigned int8 asymmetric weight compression with a typical non-fixed zero-point - - quantization to [0, 255] range. - INT4_ASYM mode refers to unsigned int4 asymmetric weight compression with a typical non-fixed zero-point - - quantization to [0, 15] range. - INT4_SYM mode refers to signed int4 symmetric weight compression without zero point - - quantization to [-8, 7] range. - NF4 or E2M1 mode requires a dedicated procedure and it is not supported in this method. - One of the parameter of compression config is a group size. Quantization is per-channel, if group size equals to -1, - otherwise it's per-group, i.e. group size number of weights in the channel dimension share quantization parameters - (scales). - - :param weight: Weight array to compress. - :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). - :param config: Information on how to compress (quantize) a specific weight. - :param precomputed_scale: Precomputed scale. - :param precomputed_zero_point: Precomputed zero point. - :param invert_scale: applies inversion for scale and then multiply by weights instead of division. - Need as reference implementation for OV. - :return: The compressed weights tensor of uint8 (asymmetric mode) or int8 (symmetric mode) type, - scale tensor of float32 type and zero point tensor of int32 type that was used for its quantization. 
- """ - assert config.is_integer(), "The function supports integer quantization only" - group_size = config.group_size - - if weight.dtype != TensorDataType.float32: - weight = weight.astype(TensorDataType.float32) - - if group_size != -1: - # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] - weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) - - if precomputed_zero_point is None or precomputed_zero_point is None: - scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config) - if precomputed_scale is not None: - scale = precomputed_scale - if precomputed_zero_point is not None: - zero_point = precomputed_zero_point - - compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_scale) - return compressed_weights, scale, zero_point - - def get_integer_quantization_error( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig + weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, invert_division=False ) -> float: """ Calculates a quantity characterizing the difference between floating point weights and fake quantized @@ -394,7 +344,9 @@ def get_integer_quantization_error( if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - compressed_weights, scale, zero_point = do_int_quantization(weight, reduction_axes, config) + compressed_weights, scale, zero_point = do_int_quantization( + weight, reduction_axes, config, invert_division=invert_division + ) decompressed_weight = do_int_dequantization(compressed_weights, scale, zero_point) decompressed_weight = decompressed_weight.reshape(orig_shape) @@ -410,6 +362,7 @@ def compress_weight( config: WeightCompressionConfig, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, + invert_division=False, ): """ Compress weight using compression configuration. 
@@ -427,7 +380,7 @@ def compress_weight(
         )
         return CompressedWeight(compressed_weight, scale)
     compressed_weight, scale, zero_point = do_int_quantization(
-        weight, reduction_axes, config, precomputed_scale, precomputed_zero_point
+        weight, reduction_axes, config, precomputed_scale, precomputed_zero_point, invert_division=invert_division
     )
     return CompressedWeight(compressed_weight, scale, zero_point)
 
@@ -472,3 +425,99 @@ def do_int_dequantization(
         decompressed_weight = ungroup_weights(decompressed_weight, reduction_axis)
 
     return decompressed_weight
+
+
+def do_int_quantization(
+    weight: Tensor,
+    reduction_axes: Tuple[int, ...],
+    config: WeightCompressionConfig,
+    precomputed_scale: Tensor = None,
+    precomputed_zero_point: Tensor = None,
+    invert_division: Optional[bool] = False,
+    ov_model_params: Optional[OVModelParameters] = None,
+):
+    assert config.is_integer(), "The function supports integer quantization only"
+
+    accelerate_through_ov = is_openvino_available()
+
+    if not accelerate_through_ov:
+        group_size = config.group_size
+
+        if weight.dtype != TensorDataType.float32:
+            weight = weight.astype(TensorDataType.float32)
+
+        if group_size != -1:
+            # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2]
+            weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size)
+
+        scale, zero_point = None, None
+        if precomputed_scale is None or precomputed_zero_point is None:
+            scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config)
+        if precomputed_scale is not None:
+            scale = precomputed_scale
+        if precomputed_zero_point is not None:
+            zero_point = precomputed_zero_point
+
+        compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_division)
+        return compressed_weights, scale, zero_point
+
+    weight_shape = weight.shape
+    scale_shape = None if precomputed_scale is None else precomputed_scale.shape
+    zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape
+
+    if ov_model_params is None:
+        ov_model_params = OVModelParameters()
+    # TODO: Try reshaping weight before inputting it to the model
+    if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]:
+        ov_model_params.dynamic = False
+
+    model = get_compress_weight_model(
+        config,
+        weight_shape,
+        scale_shape,
+        zero_point_shape,
+        reduction_axes,
+        ov_model_params,
+    )
+
+    if precomputed_scale is None:
+        results = model(weight.data)
+        compressed_weight, scale, zero_point = [Tensor(it) for it in results]
+    else:
+        inputs = [weight.data, precomputed_scale.data]
+        if precomputed_zero_point is not None:
+            inputs += [precomputed_zero_point.data]
+        compressed_weight = Tensor(model(inputs)[0])
+        scale, zero_point = precomputed_scale, precomputed_zero_point
+
+    return compressed_weight, scale, zero_point
+
+
+def calculate_quantized_dequantized_weight(
+    weight: Tensor, config: WeightCompressionConfig, scale: Tensor, zero_point: Optional[Tensor] = None,
+    invert_division: Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None,
+) -> Tensor:
+    accelerate_through_ov = is_openvino_available()
+
+    if not accelerate_through_ov:
+        compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point, invert_division)
+        decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point)
+        return decompressed_weight
+
+    weight_shape = weight.shape
+    scale_shape = scale.shape
+    zero_point_shape = None if zero_point is None else zero_point.shape
+
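+    # OV path: build (or fetch from the compiled-model cache) a compress-decompress model for these shapes
+    # and run it on the weight to obtain the fake-quantized (dequantized) result.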
+ if ov_model_params is None: + ov_model_params = OVModelParameters() + if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: + ov_model_params.dynamic = False + + model = get_compress_decompress_weight_model(config, weight_shape, scale_shape, zero_point_shape, ov_model_params) + + inputs = [weight.data, scale.data] + if zero_point is not None: + inputs.append(zero_point.data) + results = model(inputs) + decompressed_weight = [Tensor(it) for it in results][0] + return decompressed_weight diff --git a/nncf/quantization/fake_quantize.py b/nncf/quantization/fake_quantize.py index d5a3e96ae64..a225f53853a 100644 --- a/nncf/quantization/fake_quantize.py +++ b/nncf/quantization/fake_quantize.py @@ -11,7 +11,7 @@ import warnings from dataclasses import dataclass -from typing import Tuple +from typing import Optional, Tuple import nncf from nncf.common.quantization.quantizers import calculate_asymmetric_level_ranges @@ -339,7 +339,12 @@ def _calculate_scaled_parameters( def calculate_scale_zero_point( - input_low: Tensor, input_high: Tensor, level_low: int, level_high: int, narrow_range: bool + input_low: Tensor, + input_high: Tensor, + level_low: int, + level_high: int, + narrow_range: bool, + invert_division: Optional[bool] = False, ) -> Tuple[Tensor, Tensor]: """ Calculates scale and zero_point values for the quantizer. @@ -355,11 +360,17 @@ def calculate_scale_zero_point( :return: Scale and Zero point values. """ levels = level_high - level_low if narrow_range else level_high - level_low + 1 - scale = ((input_high - input_low) / (levels - 1)).astype(TensorDataType.float32) + if invert_division: + scale = ((input_high - input_low) * (1.0 / (levels - 1))).astype(TensorDataType.float32) + else: + scale = ((input_high - input_low) / (levels - 1)).astype(TensorDataType.float32) eps = fns.finfo(scale).eps # NOTE: adding machine epsilon to avoid division by zero scale = fns.where(fns.abs(scale) < eps, eps, scale) expected_level_low = level_low + 1 if narrow_range else level_low - zero_point = expected_level_low - fns.round(input_low / scale) + if invert_division: + zero_point = expected_level_low - fns.round(input_low * (1.0 / scale)) + else: + zero_point = expected_level_low - fns.round(input_low / scale) zero_point = fns.clip(zero_point.astype(TensorDataType.int32), level_low, level_high) return scale, zero_point diff --git a/nncf/tensor/definitions.py b/nncf/tensor/definitions.py index 5d2df4ac035..a4849e558e3 100644 --- a/nncf/tensor/definitions.py +++ b/nncf/tensor/definitions.py @@ -60,6 +60,7 @@ class TensorBackend(Enum): numpy = auto() torch = auto() + ov = auto() @dataclass diff --git a/nncf/tensor/functions/__init__.py b/nncf/tensor/functions/__init__.py index 5a286a6fc13..9affab79c90 100644 --- a/nncf/tensor/functions/__init__.py +++ b/nncf/tensor/functions/__init__.py @@ -75,5 +75,8 @@ def _initialize_backends(): import nncf.tensor.functions.torch_linalg import nncf.tensor.functions.torch_numeric # noqa: F401 + with contextlib.suppress(ImportError): + import nncf.tensor.functions.ov # noqa: F401 + _initialize_backends() diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py new file mode 100644 index 00000000000..32bc615d30b --- /dev/null +++ b/nncf/tensor/functions/ov.py @@ -0,0 +1,41 @@ +import numpy as np +import openvino as ov + +from nncf.tensor import TensorDataType +from nncf.tensor.functions import numeric + +from ..definitions import TensorBackend +from .numpy_numeric import DTYPE_MAP as NP_DTYPE_MAP + +DTYPE_MAP = { + 
TensorDataType.float16: ov.Type.f16, + TensorDataType.bfloat16: ov.Type.bf16, + TensorDataType.float32: ov.Type.f32, + TensorDataType.float64: ov.Type.f64, + TensorDataType.int8: ov.Type.i8, + TensorDataType.int32: ov.Type.i32, + TensorDataType.int64: ov.Type.i64, + TensorDataType.uint8: ov.Type.u8, +} + +DTYPE_MAP_REV = {v: k for k, v in DTYPE_MAP.items()} + + +@numeric.backend.register(ov.Tensor) +def _(a: ov.Tensor) -> TensorBackend: + return TensorBackend.ov + + +@numeric.astype.register(ov.Tensor) +def _(a: ov.Tensor, dtype: TensorDataType) -> np.ndarray: + return a.data.astype(NP_DTYPE_MAP[dtype]) + + +@numeric.dtype.register(ov.Tensor) +def _(a: ov.Tensor) -> TensorDataType: + return DTYPE_MAP_REV[a.get_element_type()] + + +@numeric.size.register(ov.Tensor) +def _(a: ov.Tensor) -> int: + return a.size diff --git a/nncf/utils.py b/nncf/utils.py new file mode 100644 index 00000000000..50a315e4048 --- /dev/null +++ b/nncf/utils.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib + +_openvino_available = importlib.util.find_spec("openvino") is not None +_openvino_version = "N/A" +if _openvino_available: + try: + from openvino.runtime import get_version + + version = get_version() + # avoid invalid format + if "-" in version: + ov_major_version, dev_info = version.split("-", 1) + commit_id = dev_info.split("-")[0] + version = f"{ov_major_version}-{commit_id}" + _openvino_version = version + except ImportError: + _openvino_available = False + + +def is_openvino_available(): + return _openvino_available diff --git a/run_weight_compression.py b/run_weight_compression.py new file mode 100644 index 00000000000..0413034449d --- /dev/null +++ b/run_weight_compression.py @@ -0,0 +1,373 @@ +import os +import shutil +import subprocess +import threading +import time +from pathlib import Path + + +def stream_handler(stream, target_file): + for line in iter(stream.readline, ''): + print(line, end='') + target_file.write(line) + + +parent_model_dir = Path("/home/nsavel/workspace/openvino.genai/llm_bench/python/models") +parent_log_dir = Path("compression_logs") + +experiment_params = [ + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile 
--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), + # (parent_model_dir / 
"tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory 
--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", 
"--end-to-end --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / 
"recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # + # + # + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym "), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym "), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + 
(parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + + 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym "), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym "), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir 
/ "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", 
"--compression-mode int4_asym --numpy"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym "), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym "), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym 
--end-to-end"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + # + # + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", 
"--save-model --end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), + # + # + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + # + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), +] + +for model_dir, log_dir, params in experiment_params: + model_path = model_dir / "openvino_model.xml" + cmd = f"/home/nsavel/venvs/nncf/bin/python weight_compression.py --model-path {model_path} --log-dir {log_dir} {params}" + + log_dir.mkdir(parents=True, exist_ok=True) + with 
open(log_dir / "log.txt", "a") as log_file: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + universal_newlines=True, + preexec_fn=os.setsid, + ) + + stdout_thread = threading.Thread(target=stream_handler, args=(process.stdout, log_file)) + stderr_thread = threading.Thread(target=stream_handler, args=(process.stderr, log_file)) + + stdout_thread.start() + stderr_thread.start() + + stdout_thread.join() + stderr_thread.join() + + process.wait() + time.sleep(10) + +evaluated_paths = set() +for _, log_dir, _ in experiment_params: + for model_path in log_dir.rglob("**/*"): + model_path: Path + if model_path.suffix != ".xml": + continue + if model_path.absolute() in evaluated_paths: + continue + evaluated_paths.add(model_path.absolute()) + + model_dir = model_path.parent.absolute() + cmd = f"/home/nsavel/venvs/lm-evaluation-harness/bin/lm_eval --model openvino --model_args pretrained={model_dir},trust_remote_code=True --tasks wikitext --output_path {model_dir}" + process = subprocess.Popen(cmd, shell=True) + process.wait() diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index edc50652710..5d89c75e542 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -27,6 +27,7 @@ from nncf.data.dataset import Dataset from nncf.experimental.common.tensor_statistics.collectors import AggregatorBase from nncf.openvino.graph.node_utils import get_const_value +from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE from nncf.parameters import BackupMode from nncf.quantization import compress_weights from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams @@ -35,7 +36,6 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA -from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error @@ -1038,8 +1038,8 @@ def test_np_ov_compression_decompression(mode): decompressed_weighs = decompressed_weighs.data zp_shape = zp.shape if zp is not None else None - compress = OVWeightCompressionAlgoBackend.get_compress_pipeline(config, w.shape, scale.shape, zp_shape) - compress_decompress = OVWeightCompressionAlgoBackend.get_compress_decompress_pipeline( + compress = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive(config, w.shape, scale.shape, zp_shape) + compress_decompress = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_decompress_weight_primitive( config, w.shape, scale.shape, zp_shape ) diff --git a/weight_compression.py b/weight_compression.py new file mode 100644 index 00000000000..bb6921e3558 --- /dev/null +++ b/weight_compression.py @@ -0,0 +1,234 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import gc
+import os
+import shutil
+import time
+from functools import partial
+from pathlib import Path
+
+import openvino as ov
+
+import nncf
+from nncf.quantization.algorithms.weight_compression.openvino_modeling import COMPILED_MODEL_CACHE
+from tools.memory_monitor import MemoryMonitor
+from tools.memory_monitor import MemoryType
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--model-path", type=str, required=True, help="Path where the model is stored")
+
+    parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved")
+
+    parser.add_argument("--compression-mode", default="int8_asym", type=str, choices=["int8_asym", "int8_sym", "int4_asym", "int4_sym",], help="Weight compression mode")
+
+    parser.add_argument("--numpy", action="store_true", help="Enable numpy compression")
+
+    parser.add_argument("--dynamic", action="store_true", help="Enable compression with dynamic-shaped OV models")
+
+    parser.add_argument("--end-to-end", action="store_true", help="Enable end-to-end OV compression")
+
+    parser.add_argument("--input-dtype", type=str, choices=["fp32", "fp16", "bf16"], default=None, help="OV model input dtype")
+
+    parser.add_argument("--fp32-output", action="store_true", help="Output in fp32 instead of (u)int8")
+
+    parser.add_argument("--recompile", action="store_true", help="Recompile model every time")
+
+    parser.add_argument("--share-outputs", action="store_true", help="Share OV model outputs")
+
+    parser.add_argument("--save-model", action="store_true", help="Save compressed model")
+
+    parser.add_argument("--compare-with-numpy", action="store_true", help="Compare compressed weight with the one computed with NumPy")
+
+    parser.add_argument("--invert-numpy-division", action="store_true", help="Invert division when compressing with NumPy")
+
+    parser.add_argument("--release-memory", action="store_true", help="Release memory")
+
+    return parser.parse_args()
+
+
+def log(mm, fz, log_dir):
+    mm.save_memory_logs(
+        *mm.get_data(memory_from_zero=fz), save_dir=Path(log_dir), filename_suffix="_from-zero" if fz else ""
+    )
+
+
+def count_node_dtypes(model):
+    # Get the main dtype of weight constants
+    node_count_per_dtype = dict(f32=0, f16=0, bf16=0)
+    for node in model.get_ordered_ops():
+        friendly_name = node.get_friendly_name()
+        if node.get_type_name() != "Constant" or ".weight" not in friendly_name:
+            continue
+        const_dtype = node.get_element_type().get_type_name()
+        if const_dtype in node_count_per_dtype:
+            node_count_per_dtype[const_dtype] = node_count_per_dtype[const_dtype] + 1
+    return node_count_per_dtype
+
+
+def main(args):
+    model_path = Path(args.model_path)
+    log_dir = Path(args.log_dir)
+
+    numpy_compression = args.numpy
+    dynamic_compression = args.dynamic
+    end_to_end_compression = args.end_to_end
+    input_dtype = args.input_dtype
+    fp32_output = args.fp32_output
+    recompile = args.recompile
+    share_outputs = args.share_outputs
+    save_model = args.save_model
+    compare_with_numpy = args.compare_with_numpy
+    invert_numpy_division = args.invert_numpy_division or compare_with_numpy
+    release_memory = args.release_memory
+
+    log_dir_suffix = f"{model_path.parent.name}_"
+    if numpy_compression:
+        log_dir_suffix = f"{log_dir_suffix}numpy"
+        if invert_numpy_division:
+            log_dir_suffix += "_inverted"
+    else:
+        log_dir_suffix = f"{log_dir_suffix}{'end-to-end_' if end_to_end_compression else ''}"
+        log_dir_suffix = f"{log_dir_suffix}{'ov-dynamic' if dynamic_compression else 'ov-static'}"
+        log_dir_suffix = f"{log_dir_suffix}_{'output-fp32' if fp32_output else 'output-i8'}"
+        if input_dtype is not None:
+            log_dir_suffix = f"{log_dir_suffix}_{f'input-{input_dtype}'}"
+        if recompile:
+            log_dir_suffix = f"{log_dir_suffix}_recompile"
+        if release_memory:
+            log_dir_suffix = f"{log_dir_suffix}_release-memory"
+        if share_outputs:
+            log_dir_suffix = f"{log_dir_suffix}_share-outputs"
+    print(f"Log dir suffix: {log_dir_suffix}")
+
+    memory_monitors = []
+    for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]:
+        memory_monitor = MemoryMonitor(interval=1e-2, memory_type=memory_type, include_child_processes=bool(0))
+        memory_monitor.start(at_exit_fn=partial(log, memory_monitor, mem_from_zero, log_dir / log_dir_suffix))
+        memory_monitors.append(memory_monitor)
+
+    core = ov.Core()
+    # core.set_property({"ENABLE_MMAP": "NO"})
+    model = core.read_model(model_path)
+
+    node_count_per_dtype = count_node_dtypes(model)
+    assert max(node_count_per_dtype.values()) == sum(node_count_per_dtype.values()), "Not all consts have the same type"
+    node_count_per_dtype = sorted([(v, k) for k, v in node_count_per_dtype.items()], reverse=True)
+    model_dtype = dict(f32="fp32", f16="fp16", bf16="bf16")[node_count_per_dtype[0][1]]
+
+    # Update input dtype based on model
+    input_dtype = input_dtype or model_dtype
+
+    os.environ["MODEL_PATH"] = str(model_path)
+    os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}"
+    os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}"
+    os.environ["END_TO_END_COMPRESSION"] = f"{int(end_to_end_compression)}"
+    os.environ["INPUT_DTYPE"] = input_dtype
+    os.environ["FP32_OUTPUT"] = f"{int(fp32_output)}"
+    os.environ["RECOMPILE"] = f"{int(recompile)}"
+    os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}"
+    os.environ["COMPARE_WITH_NUMPY"] = f"{int(compare_with_numpy)}"
+    os.environ["INVERT_NUMPY_DIVISION"] = f"{int(invert_numpy_division)}"
+    os.environ["RELEASE_MEMORY"] = f"{int(release_memory)}"
+
+    start_time = time.perf_counter()
+    if args.compression_mode == "int8_asym":
+        compression_mode = nncf.CompressWeightsMode.INT8_ASYM
+    elif args.compression_mode == "int8_sym":
+        compression_mode = nncf.CompressWeightsMode.INT8_SYM
+    elif args.compression_mode == "int4_asym":
+        compression_mode = nncf.CompressWeightsMode.INT4_ASYM
+    elif args.compression_mode == "int4_sym":
+        compression_mode = nncf.CompressWeightsMode.INT4_SYM
+    else:
+        raise ValueError(f"Unknown weight compression mode argument: {args.compression_mode}")
+    compressed_model = nncf.compress_weights(model, mode=compression_mode)
+    compression_time = time.perf_counter() - start_time
+    print(f"Compression Time: {compression_time:.2f} sec.")
+
+    if save_model:
+        ov.save_model(compressed_model, log_dir / log_dir_suffix / "openvino_model.xml")
+        for filepath in model_path.parent.glob("*.json"):
+            shutil.copy(str(filepath), str(log_dir / log_dir_suffix / filepath.name))
+
+    del core
+    del model
+    del compressed_model
+    gc.collect()
+    time.sleep(0.5)
+
+    before_cache_deletion = memory_monitors[2].get_data(True)[1][-1]
+    if not COMPILED_MODEL_CACHE.is_empty():
+        COMPILED_MODEL_CACHE.clear()
+        gc.collect()
+        time.sleep(memory_monitors[0].interval * 10)
+        after_cache_deletion = memory_monitors[2].get_data(True)[1][-1]
+    else:
+        after_cache_deletion = before_cache_deletion
+    cache_size = before_cache_deletion - after_cache_deletion
+    print(f"Cache size: {cache_size:.2f} MiB")
+
+    time.sleep(memory_monitors[0].interval * 10)
+
+    leftover_memory = memory_monitors[2].get_data(True)[1][-1]
+    peak_memory = max(memory_monitors[2].get_data(True)[1])
+    print(f"Peak memory: {peak_memory:.2f} MiB")
+    print(f"Leftover memory: {leftover_memory:.2f} MiB")
+    print("Done")
+
+    csv_path = log_dir / "results.csv"
+    csv_exists = csv_path.exists()
+    csv_path.parent.mkdir(exist_ok=True, parents=True)
+    with open(csv_path, "a") as f:
+        if not csv_exists:
+            f.write(
+                "Model Path,"
+                "Model dtype,"
+                "Backend,"
+                "End to end,"
+                "Recompile,"
+                "Release memory,"
+                "Share outputs,"
+                "Input Shapes,"
+                "Input,"
+                "Output,"
+                "Compression Time,"
+                "Peak Memory,"
+                "Cache Size,"
+                "Leftover Memory"
+                "\n"
+            )
+        f.write(
+            f"{model_path},"
+            f"{model_dtype.upper()},"
+            f"{'NumPy' if numpy_compression else 'OV'},"
+            f"{'-' if numpy_compression else end_to_end_compression},"
+            f"{'-' if numpy_compression else recompile},"
+            f"{'-' if numpy_compression else release_memory},"
+            f"{'-' if numpy_compression else share_outputs},"
+            f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'},"
+            f"{'-' if numpy_compression else input_dtype.upper()},"
+            f"{'-' if numpy_compression else 'FP32' if fp32_output else 'INT8'},"
+            f"{compression_time:.2f},"
+            f"{peak_memory:.2f},"
+            f"{cache_size:.2f},"
+            f"{leftover_memory:.2f}"
+            f"\n"
        )
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    main(args)

From bd2629b181f7b5ab7b4642a58a3301edd3e9b57e Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 22 Oct 2024 16:26:23 +0200
Subject: [PATCH 02/73] Unstage helper scripts

---
 .../weight_compression/openvino_modeling.py   |  12 -
 .../weight_compression/scale_estimation.py    |   2 -
 .../weight_compression/weight_lowering.py     |  13 +-
 nncf/tensor/functions/ov.py                   |  11 +
 run_weight_compression.py                     | 373 ------------------
 weight_compression.py                         | 234 -----------
 6 files changed, 20 insertions(+), 625 deletions(-)
 delete mode 100644 run_weight_compression.py
 delete mode 100644 weight_compression.py

diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
index b4443970e30..afd31b8215c 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
@@ -9,18 +9,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-# Copyright (c) 2024 Intel Corporation
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import inspect
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
index 1b4827038c9..e294c6e0f5d 100644
--- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py
+++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
@@ -307,7 +307,6 @@ def calculate_quantization_params(
 
             factor = 1.0 - 0.05 * scale_step
             scaled_scale = factor * scale
-            input_tensors[1] = scaled_scale.data
             if config.mode == CompressWeightsMode.NF4:
                 out = do_nf4_quantization(original_weight, scaled_scale)
             else:
@@ -319,7 +318,6 @@ def calculate_quantization_params(
 
             near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance)
            near_to_ideal_scale = near_to_ideal_scale * scale_sign
-            input_tensors[1] = near_to_ideal_scale.data
             if config.mode == CompressWeightsMode.NF4:
                 g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale)
                 out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale)
diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
index 08aff97d5cd..87fe07d569e 100644
--- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py
+++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -17,8 +17,9 @@
 import nncf
 from nncf.parameters import CompressWeightsMode
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
-from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters, \
-    get_compress_decompress_weight_model, get_compress_weight_model
+from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters
+from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model
+from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model
 from nncf.quantization.fake_quantize import calculate_scale_zero_point
 from nncf.tensor import Tensor
 from nncf.tensor import functions as fns
@@ -494,8 +495,12 @@ def do_int_quantization(
 
 
 def calculate_quantized_dequantized_weight(
-    weight: Tensor, config: WeightCompressionConfig, scale: Tensor, zero_point: Optional[Tensor] = None,
-    invert_division: Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None,
+    weight: Tensor,
+    config: WeightCompressionConfig,
+    scale: Tensor,
+    zero_point: Optional[Tensor] = None,
+    invert_division: Optional[bool] = False,
+    ov_model_params: Optional[OVModelParameters] = None,
 ) -> Tensor:
 
     accelerate_through_ov = is_openvino_available()
diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py
index 32bc615d30b..cd094e7a0e0 100644
--- a/nncf/tensor/functions/ov.py
+++ b/nncf/tensor/functions/ov.py
@@ -1,3 +1,14 @@
+# Copyright (c) 2024 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np import openvino as ov diff --git a/run_weight_compression.py b/run_weight_compression.py deleted file mode 100644 index 0413034449d..00000000000 --- a/run_weight_compression.py +++ /dev/null @@ -1,373 +0,0 @@ -import os -import shutil -import subprocess -import threading -import time -from pathlib import Path - - -def stream_handler(stream, target_file): - for line in iter(stream.readline, ''): - print(line, end='') - target_file.write(line) - - -parent_model_dir = Path("/home/nsavel/workspace/openvino.genai/llm_bench/python/models") -parent_log_dir = Path("compression_logs") - -experiment_params = [ - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), - # (parent_model_dir / 
"tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), - # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), - # 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile 
--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / 
"recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # - # - # - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym "), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode 
int4_asym --end-to-end --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym "), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / 
"tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym "), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym "), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), - 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym "), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode 
int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym "), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym 
--recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # - # - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", 
"--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), - # - # - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / 
"Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), -] - -for model_dir, log_dir, params in experiment_params: - model_path = model_dir / "openvino_model.xml" - cmd = f"/home/nsavel/venvs/nncf/bin/python weight_compression.py --model-path {model_path} --log-dir {log_dir} {params}" - - log_dir.mkdir(parents=True, exist_ok=True) - with open(log_dir / "log.txt", "a") as log_file: - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True, - universal_newlines=True, - preexec_fn=os.setsid, - ) - - stdout_thread = threading.Thread(target=stream_handler, args=(process.stdout, log_file)) - stderr_thread = threading.Thread(target=stream_handler, args=(process.stderr, log_file)) - - stdout_thread.start() - stderr_thread.start() - - stdout_thread.join() - stderr_thread.join() - - process.wait() - time.sleep(10) - -evaluated_paths = set() -for _, log_dir, _ in experiment_params: - for model_path in log_dir.rglob("**/*"): - model_path: Path - if model_path.suffix != ".xml": - continue - if model_path.absolute() in evaluated_paths: - continue - evaluated_paths.add(model_path.absolute()) - - model_dir = model_path.parent.absolute() - cmd = f"/home/nsavel/venvs/lm-evaluation-harness/bin/lm_eval --model openvino --model_args pretrained={model_dir},trust_remote_code=True --tasks wikitext --output_path {model_dir}" - process = subprocess.Popen(cmd, shell=True) - process.wait() diff --git a/weight_compression.py b/weight_compression.py deleted file mode 100644 index bb6921e3558..00000000000 --- a/weight_compression.py +++ /dev/null @@ -1,234 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import gc -import os -import shutil -import time -from functools import partial -from pathlib import Path - -import openvino as ov - -import nncf -from nncf.quantization.algorithms.weight_compression.openvino_modeling import COMPILED_MODEL_CACHE -from tools.memory_monitor import MemoryMonitor -from tools.memory_monitor import MemoryType - - -def parse_arguments(): - parser = argparse.ArgumentParser() - - parser.add_argument("--model-path", type=str, required=True, help="Path where the model is stored") - - parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved") - - parser.add_argument("--compression-mode", default="int8_asym", type=str, choices=["int8_asym", "int8_sym", "int4_asym", "int4_sym",], help="Weight compression mode") - - parser.add_argument("--numpy", action="store_true", help="Enable numpy compression") - - parser.add_argument("--dynamic", action="store_true", help="Enable compression with dynamic-shaped OV models") - - parser.add_argument("--end-to-end", action="store_true", help="Enable end-to-end OV compression") - - parser.add_argument("--input-dtype", type=str, choices=["fp32", "fp16", "bf16"], default=None, help="OV model input dtype") - - parser.add_argument("--fp32-output", action="store_true", help="Output in fp32 instead of (u)int8") - - parser.add_argument("--recompile", action="store_true", help="Recompile model every time") - - parser.add_argument("--share-outputs", action="store_true", help="Share OV model outputs") - - parser.add_argument("--save-model", action="store_true", help="Save compressed model") - - parser.add_argument("--compare-with-numpy", action="store_true", help="Compare compressed weight with the one computed with NumPy") - - parser.add_argument("--invert-numpy-division", action="store_true", help="Invert division when compressing with NumPy") - - parser.add_argument("--release-memory", action="store_true", help="Release memory") - - return parser.parse_args() - - -def log(mm, fz, log_dir): - mm.save_memory_logs( - *mm.get_data(memory_from_zero=fz), save_dir=Path(log_dir), filename_suffix="_from-zero" if fz else "" - ) - - -def count_node_dtypes(model): - # Get the main dtype of weight constants - node_count_per_dtype = dict(f32=0, f16=0, bf16=0) - for node in model.get_ordered_ops(): - friendly_name = node.get_friendly_name() - if node.get_type_name() != "Constant" or ".weight" not in friendly_name: - continue - const_dtype = node.get_element_type().get_type_name() - if const_dtype in node_count_per_dtype: - node_count_per_dtype[const_dtype] = node_count_per_dtype[const_dtype] + 1 - return node_count_per_dtype - - -def main(args): - model_path = Path(args.model_path) - log_dir = Path(args.log_dir) - - numpy_compression = args.numpy - dynamic_compression = args.dynamic - end_to_end_compression = args.end_to_end - input_dtype = args.input_dtype - fp32_output = args.fp32_output - recompile = args.recompile - share_outputs = args.share_outputs - save_model = args.save_model - compare_with_numpy = args.compare_with_numpy - invert_numpy_division = args.invert_numpy_division or compare_with_numpy - release_memory = args.release_memory - - log_dir_suffix = f"{model_path.parent.name}_" - if numpy_compression: - log_dir_suffix = f"{log_dir_suffix}numpy" - if invert_numpy_division: - log_dir_suffix += "_inverted" - else: - log_dir_suffix = f"{log_dir_suffix}{'end-to-end_' if end_to_end_compression else ''}" - log_dir_suffix = f"{log_dir_suffix}{'ov-dynamic' if 
dynamic_compression else 'ov-static'}" - log_dir_suffix = f"{log_dir_suffix}_{'output-fp32' if fp32_output else 'output-i8'}" - if input_dtype is not None: - log_dir_suffix = f"{log_dir_suffix}_{f'input-{input_dtype}'}" - if recompile: - log_dir_suffix = f"{log_dir_suffix}_recompile" - if release_memory: - log_dir_suffix = f"{log_dir_suffix}_release-memory" - if share_outputs: - log_dir_suffix = f"{log_dir_suffix}_share-outputs" - print(f"Log dir suffix: {log_dir_suffix}") - - memory_monitors = [] - for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]: - memory_monitor = MemoryMonitor(interval=1e-2, memory_type=memory_type, include_child_processes=bool(0)) - memory_monitor.start(at_exit_fn=partial(log, memory_monitor, mem_from_zero, log_dir / log_dir_suffix)) - memory_monitors.append(memory_monitor) - - core = ov.Core() - # core.set_property({"ENABLE_MMAP": "NO"}) - model = core.read_model(model_path) - - node_count_per_dtype = count_node_dtypes(model) - assert max(node_count_per_dtype.values()) == sum(node_count_per_dtype.values()), "Not all consts have the same type" - node_count_per_dtype = sorted([(v, k) for k, v in node_count_per_dtype.items()], reverse=True) - model_dtype = dict(f32="fp32", f16="fp16", bf16="bf16")[node_count_per_dtype[0][1]] - - # Update input dtype based on model - input_dtype = input_dtype or model_dtype - - os.environ["MODEL_PATH"] = str(model_path) - os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}" - os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}" - os.environ["END_TO_END_COMPRESSION"] = f"{int(end_to_end_compression)}" - os.environ["INPUT_DTYPE"] = input_dtype - os.environ["FP32_OUTPUT"] = f"{int(fp32_output)}" - os.environ["RECOMPILE"] = f"{int(recompile)}" - os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}" - os.environ["COMPARE_WITH_NUMPY"] = f"{int(compare_with_numpy)}" - os.environ["INVERT_NUMPY_DIVISION"] = f"{int(invert_numpy_division)}" - os.environ["RELEASE_MEMORY"] = f"{int(release_memory)}" - - start_time = time.perf_counter() - if args.compression_mode == "int8_asym": - compression_mode = nncf.CompressWeightsMode.INT8_ASYM - elif args.compression_mode == "int8_sym": - compression_mode = nncf.CompressWeightsMode.INT8_SYM - elif args.compression_mode == "int4_asym": - compression_mode = nncf.CompressWeightsMode.INT4_ASYM - elif args.compression_mode == "int4_sym": - compression_mode = nncf.CompressWeightsMode.INT4_SYM - else: - raise ValueError(f"Unknown weight compression mode argument: {args.compression_mode}") - compressed_model = nncf.compress_weights(model, mode=compression_mode) - compression_time = time.perf_counter() - start_time - print(f"Compression Time: {compression_time:.2f} sec.") - - if save_model: - ov.save_model(compressed_model, log_dir / log_dir_suffix / "openvino_model.xml") - for filepath in model_path.parent.glob("*.json"): - shutil.copy(str(filepath), str(log_dir / log_dir_suffix / filepath.name)) - - del core - del model - del compressed_model - gc.collect() - time.sleep(0.5) - - before_cache_deletion = memory_monitors[2].get_data(True)[1][-1] - if not COMPILED_MODEL_CACHE.is_empty(): - COMPILED_MODEL_CACHE.clear() - gc.collect() - time.sleep(memory_monitors[0].interval * 10) - after_cache_deletion = memory_monitors[2].get_data(True)[1][-1] - else: - after_cache_deletion = before_cache_deletion - cache_size = before_cache_deletion - after_cache_deletion - print(f"Cache size: {cache_size:.2f} MiB") - - 
time.sleep(memory_monitors[0].interval * 10) - - leftover_memory = memory_monitors[2].get_data(True)[1][-1] - peak_memory = max(memory_monitors[2].get_data(True)[1]) - print(f"Peak memory: {peak_memory:.2f} MiB") - print(f"Leftover memory: {leftover_memory:.2f} MiB") - print("Done") - - csv_path = log_dir / "results.csv" - csv_exists = csv_path.exists() - csv_path.parent.mkdir(exist_ok=True, parents=True) - with open(csv_path, "a") as f: - if not csv_exists: - f.write( - "Model Path," - "Model dtype," - "Backend," - "End to end," - "Recompile," - "Release memory," - "Share outputs," - "Input Shapes," - "Input," - "Output," - "Compression Time," - "Peak Memory," - "Cache Size," - "Leftover Memory" - "\n" - ) - f.write( - f"{model_path}," - f"{model_dtype.upper()}," - f"{'NumPy' if numpy_compression else 'OV'}," - f"{'-' if numpy_compression else end_to_end_compression}," - f"{'-' if numpy_compression else recompile}," - f"{'-' if numpy_compression else release_memory}," - f"{'-' if numpy_compression else share_outputs}," - f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'}," - f"{'-' if numpy_compression else input_dtype.upper()}," - f"{'-' if numpy_compression else 'FP32' if fp32_output else 'INT8'}," - f"{compression_time:.2f}," - f"{peak_memory:.2f}," - f"{cache_size:.2f}," - f"{leftover_memory:.2f}" - f"\n" - ) - - -if __name__ == "__main__": - args = parse_arguments() - main(args) From 3e6925240ac5c6f04eedcb7203b6844e29979bc6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 23 Oct 2024 17:14:06 +0200 Subject: [PATCH 03/73] WIP --- nncf/openvino/graph/node_utils.py | 6 +- .../weight_compression/openvino_backend.py | 44 +-- .../weight_compression/openvino_modeling.py | 107 +++--- .../weight_compression/weight_lowering.py | 51 ++- nncf/tensor/functions/numeric.py | 21 ++ run_weight_compression.py | 318 ++++++++++++++++++ weight_compression.py | 209 ++++++++++++ 7 files changed, 662 insertions(+), 94 deletions(-) create mode 100644 run_weight_compression.py create mode 100644 weight_compression.py diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 8fab3933945..17213204268 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -107,7 +107,7 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int: return cnt_if_op(model, 0) -def get_const_value(const_node: ov.Node) -> np.ndarray: +def get_const_value(const_node: ov.Node, bf16_to_fp32: Optional[bool] = True) -> np.ndarray: """ Returns the constant tensor for the node. This method is applicable only for the floating-point constant data. @@ -115,9 +115,7 @@ def get_const_value(const_node: ov.Node) -> np.ndarray: :param const_node: OpenVINO node. :return: The constant value. 
""" - INPUT_DTYPE = os.environ.get("INPUT_DTYPE", "fp32") - NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) - if const_node.get_element_type() == ov.Type.bf16 and (INPUT_DTYPE != "bf16" or NUMPY_COMPRESSION): + if const_node.get_element_type() == ov.Type.bf16 and bf16_to_fp32: # Fixed FP32 data type as the result for BF16 constant return const_node.get_data(dtype=np.float32) return const_node.data diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index c00cb82a3f2..3caaaa1b4f9 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -14,6 +14,7 @@ from openvino import Type from openvino.properties.hint import inference_precision from openvino.runtime import opset13 as opset +from openvino.runtime.op import Constant import nncf from nncf.common.graph import NNCFGraph @@ -49,7 +50,7 @@ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor -from nncf.tensor.definitions import TensorDataType +from nncf.tensor.definitions import TensorDataType, TensorBackend class OVWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): @@ -244,37 +245,19 @@ def _create_compression_subgraph( original_shape = weight.shape compressed_weight = compress_weight(weight, reduction_axes, compression_config, layer_scales, layer_zero_points) - compressed_weight_data = compressed_weight.tensor.data - if isinstance(compressed_weight_data, ov.Tensor): - compressed_const = opset.constant(compressed_weight_data, name=const_node_name) - else: - compressed_const = opset.constant(compressed_weight_data, dtype=compression_dtype, name=const_node_name) + compressed_const = self._create_ov_const_from_tensor(compressed_weight.tensor, compression_dtype, name=const_node_name) if compressed_const.get_element_type() != compression_dtype: compressed_const = opset.convert(compressed_const, compression_dtype) converted_const = opset.convert(compressed_const, ov.Type.f16) + if compressed_weight.zero_point is not None: - zero_point_data = compressed_weight.zero_point.data - if isinstance(zero_point_data, ov.Tensor): - zero_point_const = opset.constant( - compressed_weight.zero_point.data, - name=f"{const_node_name}/zero_point", - ) - else: - zero_point_const = opset.constant( - compressed_weight.zero_point.data, - dtype=compression_dtype, - name=f"{const_node_name}/zero_point", - ) + zero_point_const = self._create_ov_const_from_tensor(compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point") zero_point_const = opset.convert(zero_point_const, ov.Type.f16) converted_const = opset.subtract( converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract" ) - scale_data = compressed_weight.scale.data - if isinstance(scale_data, ov.Tensor): - scale_const = opset.constant(scale_data, name=f"{const_node_name}/scale") - else: - scale_const = opset.constant(scale_data, dtype=scale_dtype, name=f"{const_node_name}/scale") + scale_const = self._create_ov_const_from_tensor(compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale") if scale_const.get_element_type() != ov.Type.f16: scale_const = opset.convert(scale_const, ov.Type.f16) @@ -289,6 +272,8 @@ def _create_compression_subgraph( if 
should_add_convert_node: mul = opset.convert(mul, const_dtype, name=f"{const_node_name}/fq_weights_{weight_port_id}/convert") + + # TODO: convert tensors inside compressed_weight to numpy backend if they are in ov backend return mul, compressed_weight def transform_model( @@ -307,6 +292,10 @@ def transform_model( const_node_output = const_node.output(0) const_dtype = const_node_output.get_element_type() weight = Tensor(get_const_value(const_node)) + # TODO: try to support bf16 by creating a Tensor with OV backend + # weight = Tensor(get_const_value(const_node, bf16_to_fp32=False)) + # if const_dtype == ov.Type.bf16: + # weight._is_bf16 = True should_add_convert_node = False if const_dtype != ov.Type.f16: @@ -319,9 +308,6 @@ def transform_model( layer_zero_points = ( None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name) ) - import os - - os.environ["CURRENT_NODE_NAME"] = wc_params.weight_name mul, compressed_weight = self._create_compression_subgraph( weight=weight, compression_config=wc_params.compression_config, @@ -353,6 +339,12 @@ def dump_parameters( ) -> None: dump_parameters(model, parameters, algo_name, path) + @staticmethod + def _create_ov_const_from_tensor(x: Tensor, dtype: Optional[ov.Type] = None, name: Optional[str] = None) -> Constant: + if x.backend == TensorBackend.ov: + return opset.constant(x.data, name=name) + const = opset.constant(x.data, dtype=dtype, name=name) + return const class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend): @staticmethod diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index afd31b8215c..52f7c43a167 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -10,7 +10,9 @@ # limitations under the License. 
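
The subgraph assembled in _create_compression_subgraph above (compressed constant, convert to f16, subtract zero point, multiply by scale) encodes the usual asymmetric integer scheme. Below is a small self-contained NumPy sketch of that arithmetic with made-up values and a simplified scale/zero-point formula; NNCF's own helpers may differ in rounding and clipping details, so treat it as illustrative only.

import numpy as np

# Toy weight row, 4-bit asymmetric quantization (levels 0..15).
w = np.array([-0.42, -0.10, 0.03, 0.25, 0.61], dtype=np.float32)
level_low, level_high = 0, 2**4 - 1

w_min, w_max = w.min(), w.max()
scale = (w_max - w_min) / (level_high - level_low)
zero_point = np.clip(np.round(-w_min / scale), level_low, level_high)

q = np.clip(np.round(w / scale) + zero_point, level_low, level_high).astype(np.uint8)

# Decompression mirrors the OV subgraph: (q - zero_point) * scale.
w_hat = (q.astype(np.float32) - zero_point) * scale
print(q, np.abs(w - w_hat).max())  # the reconstruction error stays around scale / 2

The convert/subtract/multiply nodes above reproduce exactly this subtract-then-multiply shape inside the IR, with the subtraction branch skipped when compressed_weight.zero_point is None.
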
import inspect +import os from dataclasses import dataclass +from functools import partial from typing import List, Optional, Tuple import numpy as np @@ -20,18 +22,19 @@ import nncf from nncf import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.tensor import TensorDataType, Tensor @dataclass class OVModelParameters: + input_dtype: TensorDataType dynamic: bool = False recompile: bool = False release_memory: bool = True share_outputs: bool = True - input_dtype: str = "fp32" def __hash__(self): - return hash((self.dynamic, self.recompile, self.release_memory, self.share_outputs, self.input_dtype)) + return hash((self.input_dtype, self.dynamic, self.recompile, self.release_memory, self.share_outputs)) class CompiledModelCache: @@ -58,25 +61,44 @@ def wrapper(*args, **kwargs): new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} new_kwargs.update(kwargs) cache_key = (func.__name__, frozenset(new_kwargs.items())) - recompile = new_kwargs.get("ov_model_params", OVModelParameters()).recompile cache = COMPILED_MODEL_CACHE._cache - if not recompile and cache_key in cache: + if cache_key in cache: return cache[cache_key] result = func(*args, **kwargs) - cache[cache_key] = result + recompile = new_kwargs["ov_model_params"].recompile + if not recompile: + cache[cache_key] = result return result return wrapper -@cache_results +def run_model(ov_model_params, compiled_model, inputs): + # Returns results as numpy tensors + outputs = compiled_model(inputs, share_outputs=ov_model_params.share_outputs) + outputs = [Tensor(outputs[i]) for i in range(len(outputs))] + if ov_model_params.release_memory: + compiled_model.release_memory() + return outputs + + +def run_model_via_infer_request(ov_model_params, compiled_model, inputs): + # Returns results as ov tensors + infer_request = compiled_model.create_infer_request() + infer_request.infer(inputs, share_outputs=ov_model_params.share_outputs) + outputs = [Tensor(infer_request.get_output_tensor(i)) for i in range(len(infer_request.results))] + if ov_model_params.release_memory: + compiled_model.release_memory() + return outputs + + def get_compress_weight_model( + ov_model_params: OVModelParameters, config: WeightCompressionConfig, weight_shape: Tuple, scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, - ov_model_params: Optional[OVModelParameters] = None, ): if scale_shape is None and zero_point_shape is not None: raise Exception("Zero point shape can only be provided if scale shape is provided.") @@ -101,16 +123,13 @@ def get_compress_weight_model( ) -@cache_results def get_compress_decompress_weight_model( + ov_model_params: OVModelParameters, config: WeightCompressionConfig, weight_shape: Tuple, scale_shape: Optional[Tuple], zero_point_shape: Optional[Tuple] = None, - ov_model_params: Optional[OVModelParameters] = None, ): - if ov_model_params is None: - ov_model_params = OVModelParameters() if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: ov_model_params.dynamic = False @@ -129,24 +148,7 @@ def get_compress_decompress_weight_model( ) -def _build_compress_decompress_model( - config: WeightCompressionConfig, - ov_model_params: OVModelParameters, - weight_shape: Tuple, - scale_shape: Tuple, - zero_point_shape: Optional[Tuple] = None, -): - ov_parameters, ov_results = _build_compress_model( - config, ov_model_params, weight_shape, scale_shape, zero_point_shape, 
reduction_axes=None, return_nodes=True - ) - return _get_compress_decompress_model( - config, - ov_model_params, - ov_parameters, - ov_results, - ) - - +@cache_results def _build_compress_model( config: WeightCompressionConfig, ov_model_params: OVModelParameters, @@ -156,11 +158,11 @@ def _build_compress_model( reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, ): - if ov_model_params.input_dtype == "fp32": + if ov_model_params.input_dtype == TensorDataType.float32: input_dtype = ov.Type.f32 - elif ov_model_params.input_dtype == "fp16": + elif ov_model_params.input_dtype == TensorDataType.float16: input_dtype = ov.Type.f16 - elif ov_model_params.input_dtype == "bf16": + elif ov_model_params.input_dtype == TensorDataType.bfloat16: input_dtype = ov.Type.bf16 else: raise Exception @@ -243,6 +245,25 @@ def _build_compress_model( ) +@cache_results +def _build_compress_decompress_model( + config: WeightCompressionConfig, + ov_model_params: OVModelParameters, + weight_shape: Tuple, + scale_shape: Tuple, + zero_point_shape: Optional[Tuple] = None, +): + ov_parameters, ov_results = _build_compress_model( + config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True + ) + return _get_compress_decompress_model( + config, + ov_model_params, + ov_parameters, + ov_results, + ) + + def _get_compress_model( config: WeightCompressionConfig, ov_model_params: OVModelParameters, @@ -287,15 +308,8 @@ def _get_compress_model( model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - def infer(inputs): - infer_request = compiled_model.create_infer_request() - infer_request.infer(inputs, share_outputs=ov_model_params.share_outputs) - outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))] - if ov_model_params.release_memory: - compiled_model.release_memory() - return outputs - - return infer + run_fn = run_model_via_infer_request if config.num_bits == 4 else run_model + return partial(run_fn, ov_model_params, compiled_model) def _get_compress_decompress_model( @@ -322,12 +336,5 @@ def _get_compress_decompress_model( model = ov.Model([decompressed_w], parameters) compiled_model = ov.compile_model(model, device_name="CPU") - def infer(inputs): - infer_request = compiled_model.create_infer_request() - infer_request.infer(inputs, share_outputs=ov_model_params.share_outputs) - outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))] - if ov_model_params.release_memory: - compiled_model.release_memory() - return outputs - - return infer + run_fn = run_model_via_infer_request if config.num_bits == 4 else run_model + return partial(run_fn, ov_model_params, compiled_model) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 87fe07d569e..304d554b051 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -8,13 +8,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
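[Editor's note] The builders above are memoized by cache_results, which keys the compiled helper on the builder's name plus a frozenset of its keyword-normalized arguments; this is why OVModelParameters defines __hash__ and why recompile simply skips storing the result. Below is a simplified, standalone sketch of that pattern, not part of the patch, with a hypothetical build_model standing in for the real OV model builders.

import inspect
from functools import wraps

_CACHE = {}

def cache_results(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Normalize positional arguments to keyword form so equivalent calls share a key.
        sig = inspect.signature(func)
        kw = {name: arg for name, arg in zip(sig.parameters, args)}
        kw.update(kwargs)
        key = (func.__name__, frozenset(kw.items()))  # every argument must be hashable
        if key in _CACHE:
            return _CACHE[key]
        result = func(*args, **kwargs)
        _CACHE[key] = result  # the real decorator also checks ov_model_params.recompile here
        return result
    return wrapper

@cache_results
def build_model(weight_shape, group_size):  # hypothetical stand-in for the OV builders
    print("building", weight_shape, group_size)
    return object()

m1 = build_model((1024, 1024), -1)
m2 = build_model(weight_shape=(1024, 1024), group_size=-1)  # cache hit, no rebuild
assert m1 is m2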
- +import logging +import os from dataclasses import dataclass from typing import Optional, Tuple import numpy as np import nncf +from nncf.common.logging.logger import log_once from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters @@ -23,7 +25,7 @@ from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor from nncf.tensor import functions as fns -from nncf.tensor.definitions import TensorDataType +from nncf.tensor.definitions import TensorDataType, TensorBackend from nncf.utils import is_openvino_available ReductionAxes = Tuple[int, ...] @@ -430,7 +432,7 @@ def do_int_dequantization( def do_int_quantization( weight: Tensor, - reduction_axes: Tuple[int, ...], + reduction_axes: ReductionAxes, config: WeightCompressionConfig, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, @@ -439,9 +441,12 @@ def do_int_quantization( ): assert config.is_integer(), "The function supports integer quantization only" - accelerate_through_ov = is_openvino_available() + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + if not is_openvino_available() and weight.backend != TensorBackend.torch: + log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") if not accelerate_through_ov: + # Reference implementation group_size = config.group_size if weight.dtype != TensorDataType.float32: @@ -462,30 +467,40 @@ def do_int_quantization( compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) return compressed_weights, scale, zero_point + import openvino as ov + weight_shape = weight.shape scale_shape = None if precomputed_scale is None else precomputed_scale.shape zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape + is_bf16 = getattr(weight, "_is_bf16", False) + input_dtype = TensorDataType.bfloat16 if is_bf16 else weight.dtype if ov_model_params is None: - ov_model_params = OVModelParameters() + # ov_model_params = OVModelParameters(input_dtype) + ov_model_params = OVModelParameters( + input_dtype, + dynamic=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))), + recompile=bool(int(os.environ.get("RECOMPILE", "0"))), + release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))), + share_outputs=bool(int(os.environ.get("SHARE_OUTPUTS", "0"))), + ) # TODO: Try reshaping weight before inputing it to the model if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: ov_model_params.dynamic = False - model = get_compress_weight_model( + ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, - ov_model_params, ) + weight_data = ov.Tensor(weight.data, weight.data.shape, ov.Type.bf16) if is_bf16 else weight.data if precomputed_scale is None: - results = model(weight.data) - compressed_weight, scale, zero_point = [Tensor(it) for it in results] + compressed_weight, scale, zero_point = model(weight_data) else: - inputs = [weight.data, precomputed_scale.data] + inputs = [weight_data, precomputed_scale.data] if precomputed_zero_point is not None: inputs += [precomputed_zero_point.data] compressed_weight = Tensor(model(inputs)[0]) @@ -502,25 +517,33 @@ def calculate_quantized_dequantized_weight( invert_division: 
Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None, ) -> Tensor: - accelerate_through_ov = is_openvino_available() + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + if not is_openvino_available() and weight.backend != TensorBackend.torch: + log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") if not accelerate_through_ov: + # Reference implementation compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) return decompressed_weight + import openvino as ov + weight_shape = weight.shape scale_shape = scale.shape zero_point_shape = None if zero_point is None else zero_point.shape + is_bf16 = getattr(weight, "_is_bf16", False) + input_dtype = TensorDataType.bfloat16 if is_bf16 else weight.dtype if ov_model_params is None: - ov_model_params = OVModelParameters() + ov_model_params = OVModelParameters(input_dtype) if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: ov_model_params.dynamic = False - model = get_compress_decompress_weight_model(config, weight_shape, scale_shape, zero_point_shape, ov_model_params) + model = get_compress_decompress_weight_model(ov_model_params, config, weight_shape, scale_shape, zero_point_shape) - inputs = [weight.data, scale.data] + weight_data = ov.Tensor(weight.data, weight.data.shape, ov.Type.bf16) if is_bf16 else weight.data + inputs = [weight_data, scale.data] if zero_point is not None: inputs.append(zero_point.data) results = model(inputs) diff --git a/nncf/tensor/functions/numeric.py b/nncf/tensor/functions/numeric.py index 061d1ee6e66..715c963bf89 100644 --- a/nncf/tensor/functions/numeric.py +++ b/nncf/tensor/functions/numeric.py @@ -130,6 +130,27 @@ def astype(a: Tensor, data_type: TensorDataType) -> Tensor: :return: Copy of the tensor in specified type. 
""" + # is_bf16 = getattr(a, "_is_bf16", False) + # if is_bf16: + # def bf16_to_fp32_v2(x): + # # Step 1: Interpret the float16 data as uint16 to access the raw bits + # custom16_bits = x.view(np.uint16) # Keep as uint16 + # + # # Step 2: Allocate uint32 to hold the result (in-place modification in original variable) + # custom16_bits = custom16_bits.astype(np.uint32) # Cast to uint32 for safe shifting + # + # # Step 3: Extract and shift sign, exponent, and fraction directly into custom16_bits + # custom16_bits = (((custom16_bits & 0x8000) << 16) | # Extract and move sign bit to bit 31 + # ((custom16_bits & 0x7F80) << 16) | # Extract and move exponent to bits 30-23 + # ((custom16_bits & 0x007F) << 16)) # Extract and move fraction to bits 22-0 + # + # # Step 4: Interpret the resulting 32-bit integers as float32 + # float32_array = custom16_bits.view(np.float32) + # + # return float32_array + # + # fp32_data = bf16_to_fp32_v2(a.data) + # Tensor(astype(fp32_data, data_type)) return Tensor(astype(a.data, data_type)) diff --git a/run_weight_compression.py b/run_weight_compression.py new file mode 100644 index 00000000000..2d7211effc4 --- /dev/null +++ b/run_weight_compression.py @@ -0,0 +1,318 @@ +import os +import shutil +import subprocess +import threading +import time +from pathlib import Path + + +def stream_handler(stream, target_file): + for line in iter(stream.readline, ''): + print(line, end='') + target_file.write(line) + + +parent_model_dir = Path("/home/nsavel/workspace/models/hf") +parent_log_dir = Path("compression_logs") + +experiment_params = [ + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", ""), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", 
parent_log_dir / "reimpl_23102024/int8/tiny-llama", ""), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", ""), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", ""), + # 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", 
parent_log_dir / "reimpl_23102024/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--numpy"), + # (parent_model_dir / 
"Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + + + + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", 
"--compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym "), + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym "), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym "), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # 
(parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym "), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + + + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --release-memory"), + # (parent_model_dir / 
"tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --release-memory"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--release-memory"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--release-memory"), + # + # + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym "), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym "), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), +] + +for model_dir, log_dir, params in experiment_params: + model_path = model_dir / "openvino_model.xml" + cmd = f"/home/nsavel/venvs/nncf/bin/python weight_compression.py --model-path {model_path} --log-dir {log_dir} {params}" + + log_dir.mkdir(parents=True, exist_ok=True) + with open(log_dir / "log.txt", "a") as log_file: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + universal_newlines=True, + preexec_fn=os.setsid, + ) + + stdout_thread = threading.Thread(target=stream_handler, args=(process.stdout, log_file)) + stderr_thread = threading.Thread(target=stream_handler, args=(process.stderr, log_file)) + + stdout_thread.start() + stderr_thread.start() + + stdout_thread.join() + stderr_thread.join() + + process.wait() + time.sleep(10) + +evaluated_paths = set() +for _, log_dir, _ in experiment_params: + for model_path in log_dir.rglob("**/*"): + 
model_path: Path + if model_path.suffix != ".xml": + continue + if model_path.absolute() in evaluated_paths: + continue + evaluated_paths.add(model_path.absolute()) + + model_dir = model_path.parent.absolute() + cmd = f"/home/nsavel/venvs/lm-evaluation-harness/bin/lm_eval --model openvino --model_args pretrained={model_dir},trust_remote_code=True --tasks wikitext --output_path {model_dir}" + process = subprocess.Popen(cmd, shell=True) + process.wait() diff --git a/weight_compression.py b/weight_compression.py new file mode 100644 index 00000000000..54ce0690238 --- /dev/null +++ b/weight_compression.py @@ -0,0 +1,209 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import gc +import os +import shutil +import time +from functools import partial +from pathlib import Path + +import openvino as ov + +import nncf +from nncf.quantization.algorithms.weight_compression.openvino_modeling import COMPILED_MODEL_CACHE +from tools.memory_monitor import MemoryMonitor +from tools.memory_monitor import MemoryType + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model-path", type=str, required=True, help="Path where the model is stored") + + parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved") + + parser.add_argument("--compression-mode", default="int8_asym", type=str, choices=["int8_asym", "int8_sym", "int4_asym", "int4_sym",], help="Weight compression mode") + + parser.add_argument("--numpy", action="store_true", help="Enable numpy compression") + + parser.add_argument("--dynamic", action="store_true", help="Enable compression with dynamic-shaped OV models") + + parser.add_argument("--input-dtype", type=str, choices=["fp32", "fp16", "bf16"], default=None, help="OV model input dtype") + + parser.add_argument("--recompile", action="store_true", help="Recompile model every time") + + parser.add_argument("--share-outputs", action="store_true", help="Share OV model outputs") + + parser.add_argument("--save-model", action="store_true", help="Save compressed model") + + parser.add_argument("--release-memory", action="store_true", help="Release memory") + + return parser.parse_args() + + +def log(mm, fz, log_dir): + mm.save_memory_logs( + *mm.get_data(memory_from_zero=fz), save_dir=Path(log_dir), filename_suffix="_from-zero" if fz else "" + ) + + +def count_node_dtypes(model): + # Get the main dtype of weight constants + node_count_per_dtype = dict(f32=0, f16=0, bf16=0) + for node in model.get_ordered_ops(): + friendly_name = node.get_friendly_name() + if node.get_type_name() != "Constant" or ".weight" not in friendly_name: + continue + const_dtype = node.get_element_type().get_type_name() + if const_dtype in node_count_per_dtype: + node_count_per_dtype[const_dtype] = node_count_per_dtype[const_dtype] + 1 + return node_count_per_dtype + + +def main(args): + model_path = Path(args.model_path) + log_dir = Path(args.log_dir) + + numpy_compression = args.numpy + 
dynamic_compression = args.dynamic + input_dtype = args.input_dtype + recompile = args.recompile + share_outputs = args.share_outputs + save_model = args.save_model + release_memory = args.release_memory + + log_dir_suffix = f"{model_path.parent.name}_" + if numpy_compression: + log_dir_suffix = f"{log_dir_suffix}numpy" + else: + log_dir_suffix = f"{log_dir_suffix}{'ov-dynamic' if dynamic_compression else 'ov-static'}" + if input_dtype is not None: + log_dir_suffix = f"{log_dir_suffix}_{f'input-{input_dtype}'}" + if recompile: + log_dir_suffix = f"{log_dir_suffix}_recompile" + if release_memory: + log_dir_suffix = f"{log_dir_suffix}_release-memory" + if share_outputs: + log_dir_suffix = f"{log_dir_suffix}_share-outputs" + print(f"Log dir suffix: {log_dir_suffix}") + + memory_monitors = [] + for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]: + memory_monitor = MemoryMonitor(interval=1e-2, memory_type=memory_type, include_child_processes=bool(0)) + memory_monitor.start(at_exit_fn=partial(log, memory_monitor, mem_from_zero, log_dir / log_dir_suffix)) + memory_monitors.append(memory_monitor) + + core = ov.Core() + # core.set_property({"ENABLE_MMAP": "NO"}) + model = core.read_model(model_path) + + node_count_per_dtype = count_node_dtypes(model) + assert max(node_count_per_dtype.values()) == sum(node_count_per_dtype.values()), "Not all consts have the same type" + node_count_per_dtype = sorted([(v, k) for k, v in node_count_per_dtype.items()], reverse=True) + model_dtype = dict(f32="fp32", f16="fp16", bf16="bf16")[node_count_per_dtype[0][1]] + + # Update input dtype based on model + input_dtype = input_dtype or model_dtype + + os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}" + os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}" + os.environ["INPUT_DTYPE"] = input_dtype + os.environ["RECOMPILE"] = f"{int(recompile)}" + os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}" + os.environ["RELEASE_MEMORY"] = f"{int(release_memory)}" + + start_time = time.perf_counter() + if args.compression_mode == "int8_asym": + compression_mode = nncf.CompressWeightsMode.INT8_ASYM + elif args.compression_mode == "int8_sym": + compression_mode = nncf.CompressWeightsMode.INT8_SYM + elif args.compression_mode == "int4_asym": + compression_mode = nncf.CompressWeightsMode.INT4_ASYM + elif args.compression_mode == "int4_sym": + compression_mode = nncf.CompressWeightsMode.INT4_SYM + else: + raise ValueError(f"Unknown weight compression mode argument: {args.compression_mode}") + compressed_model = nncf.compress_weights(model, mode=compression_mode) + compression_time = time.perf_counter() - start_time + print(f"Compression Time: {compression_time:.2f} sec.") + + if save_model: + ov.save_model(compressed_model, log_dir / log_dir_suffix / "openvino_model.xml") + for filepath in model_path.parent.glob("*.json"): + shutil.copy(str(filepath), str(log_dir / log_dir_suffix / filepath.name)) + + del core + del model + del compressed_model + gc.collect() + time.sleep(0.5) + + before_cache_deletion = memory_monitors[2].get_data(True)[1][-1] + if not COMPILED_MODEL_CACHE.is_empty(): + COMPILED_MODEL_CACHE.clear() + gc.collect() + time.sleep(memory_monitors[0].interval * 10) + after_cache_deletion = memory_monitors[2].get_data(True)[1][-1] + else: + after_cache_deletion = before_cache_deletion + cache_size = before_cache_deletion - after_cache_deletion + print(f"Cache size: {cache_size:.2f} MiB") + + time.sleep(memory_monitors[0].interval * 
10) + + leftover_memory = memory_monitors[2].get_data(True)[1][-1] + peak_memory = max(memory_monitors[2].get_data(True)[1]) + print(f"Peak memory: {peak_memory:.2f} MiB") + print(f"Leftover memory: {leftover_memory:.2f} MiB") + print("Done") + + csv_path = log_dir / "results.csv" + csv_exists = csv_path.exists() + csv_path.parent.mkdir(exist_ok=True, parents=True) + with open(csv_path, "a") as f: + if not csv_exists: + f.write( + "Model Path," + "Model dtype," + "Backend," + "Recompile," + "Release memory," + "Share outputs," + "Input Shapes," + "Input," + "Compression Time," + "Peak Memory," + "Cache Size," + "Leftover Memory" + "\n" + ) + f.write( + f"{model_path}," + f"{model_dtype.upper()}," + f"{'NumPy' if numpy_compression else 'OV'}," + f"{'-' if numpy_compression else recompile}," + f"{'-' if numpy_compression else release_memory}," + f"{'-' if numpy_compression else share_outputs}," + f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'}," + f"{'-' if numpy_compression else input_dtype.upper()}," + f"{compression_time:.2f}," + f"{peak_memory:.2f}," + f"{cache_size:.2f}," + f"{leftover_memory:.2f}" + f"\n" + ) + + +if __name__ == "__main__": + args = parse_arguments() + main(args) From 166dd04c8670f05fab61822b3b4543d1744596e9 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 24 Oct 2024 13:41:47 +0200 Subject: [PATCH 04/73] Reshape weights beforehand --- .../weight_compression/openvino_modeling.py | 23 - .../weight_compression/weight_lowering.py | 13 +- run_weight_compression.py | 422 +++++++++--------- weight_compression.py | 1 + 4 files changed, 226 insertions(+), 233 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 52f7c43a167..064c842ef6f 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -130,9 +130,6 @@ def get_compress_decompress_weight_model( scale_shape: Optional[Tuple], zero_point_shape: Optional[Tuple] = None, ): - if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: - ov_model_params.dynamic = False - if ov_model_params.dynamic: weight_shape = (-1,) * len(weight_shape) scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) @@ -182,26 +179,6 @@ def _build_compress_model( else: # Compute compressed weight, scale and, possibly, zero point - group_size = config.group_size - if group_size != -1: - if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: - reduction_axes = reduction_axes[0] - if not isinstance(reduction_axes, int): - raise NotImplementedError( - f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." 
- ) - channel_size = weight.shape[reduction_axes] - if channel_size % group_size != 0: - raise nncf.ValidationError( - f"Channel size {channel_size} should be divisible by size of group {group_size}" - ) - - num_groups_per_channel = channel_size // group_size - shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis - shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size) - weight = opset.reshape(weight, shape, special_zero=False) - reduction_axes += 1 - mode = config.mode num_bits = config.num_bits eps = np.finfo(np.float32).eps diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 304d554b051..88b3a7358b7 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -445,17 +445,16 @@ def do_int_quantization( if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") + if config.group_size != -1: + # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] + weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) + if not accelerate_through_ov: # Reference implementation - group_size = config.group_size if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - if group_size != -1: - # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] - weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) - scale, zero_point = None, None if precomputed_zero_point is None or precomputed_zero_point is None: scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config) @@ -484,9 +483,7 @@ def do_int_quantization( release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))), share_outputs=bool(int(os.environ.get("SHARE_OUTPUTS", "0"))), ) - # TODO: Try reshaping weight before inputing it to the model - if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: - ov_model_params.dynamic = False + model = get_compress_weight_model( ov_model_params, config, diff --git a/run_weight_compression.py b/run_weight_compression.py index 2d7211effc4..d7eefec79ab 100644 --- a/run_weight_compression.py +++ b/run_weight_compression.py @@ -16,209 +16,227 @@ def stream_handler(stream, target_file): parent_log_dir = Path("compression_logs") experiment_params = [ - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", ""), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--release-memory --share-outputs"), - 
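[Editor's note] With this commit the group-wise reshape happens once in weight_lowering, before the NumPy/OV dispatch, instead of being rebuilt inside every OV graph. The following NumPy-only sketch shows the layout change described by the "[a1, r, a2] to [a1, r//gs, gs, a2]" comment above; the toy shapes and the max-abs reduction are assumptions made for illustration, not the exact NNCF formula.

import numpy as np

# Toy 2D weight: 4 output channels x 16 input channels, group size 8 along axis 1.
weight = np.arange(64, dtype=np.float32).reshape(4, 16)
reduction_axis, group_size = 1, 8

# [a1, r, a2] -> [a1, r // gs, gs, a2]: split the reduction axis into groups.
channel_size = weight.shape[reduction_axis]
assert channel_size % group_size == 0
grouped = weight.reshape(4, channel_size // group_size, group_size)
reduction_axis += 1  # per-group statistics now reduce over the innermost dimension

scale = np.max(np.abs(grouped), axis=reduction_axis, keepdims=True)  # example per-group statistic
print(grouped.shape, scale.shape)  # (4, 2, 8) (4, 2, 1)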
# (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", ""), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", ""), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory --share-outputs"), - # (parent_model_dir / 
"tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", ""), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--release-memory --share-outputs"), + # 
(parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", ""), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", ""), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", 
parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory --share-outputs"), # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", 
"--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", 
parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / 
"Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - - - - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - # 
(parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / 
"reimpl_24102024/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + + + + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / 
"tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym "), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym "), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym "), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic "), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", 
"--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym "), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym "), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode 
int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym "), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym "), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", 
"--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym "), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym "), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", 
"--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), @@ -269,10 +287,10 @@ def stream_handler(stream, target_file): # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), ] for model_dir, log_dir, params in experiment_params: diff --git a/weight_compression.py b/weight_compression.py index 54ce0690238..5bfc3bd24d7 100644 --- a/weight_compression.py +++ b/weight_compression.py @@ -133,6 +133,7 @@ def main(args): compression_mode = nncf.CompressWeightsMode.INT4_SYM else: raise ValueError(f"Unknown weight compression mode argument: {args.compression_mode}") + # TODO: Consider all_layers=True? 
     compressed_model = nncf.compress_weights(model, mode=compression_mode)
     compression_time = time.perf_counter() - start_time
     print(f"Compression Time: {compression_time:.2f} sec.")

From edbe913558be883f67523dc57f49d65f47367315 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Fri, 25 Oct 2024 13:38:28 +0200
Subject: [PATCH 05/73] BF16 support

---
 nncf/openvino/graph/node_utils.py             |  5 +-
 .../weight_compression/openvino_backend.py    | 34 ++++++++++----
 .../weight_compression/openvino_modeling.py   |  3 +-
 .../weight_compression/weight_lowering.py     | 47 +++++++++++--------
 nncf/tensor/functions/numeric.py              | 27 +++--------
 nncf/tensor/functions/ov.py                   | 38 ++++++++++++++-
 nncf/tensor/tensor.py                         |  3 ++
 7 files changed, 101 insertions(+), 56 deletions(-)

diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py
index 17213204268..39056d65af5 100644
--- a/nncf/openvino/graph/node_utils.py
+++ b/nncf/openvino/graph/node_utils.py
@@ -107,7 +107,7 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int:
     return cnt_if_op(model, 0)


-def get_const_value(const_node: ov.Node, bf16_to_fp32: Optional[bool] = True) -> np.ndarray:
+def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = True) -> np.ndarray:
     """
     Returns the constant tensor for the node.
     This method is applicable only for the floating-point constant data.
@@ -115,8 +115,7 @@ def get_const_value(const_node: ov.Node, bf16_to_fp32: Optional[bool] = True) ->
     :param const_node: OpenVINO node.
     :return: The constant value.
     """
-    if const_node.get_element_type() == ov.Type.bf16 and bf16_to_fp32:
-        # Fixed FP32 data type as the result for BF16 constant
+    if const_node.get_element_type() == ov.Type.bf16 and cast_bf16_to_fp32:
         return const_node.get_data(dtype=np.float32)
     return const_node.data

diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
index 3caaaa1b4f9..49924ead6f5 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
@@ -50,7 +50,8 @@
 from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm
 from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight
 from nncf.tensor import Tensor
-from nncf.tensor.definitions import TensorDataType, TensorBackend
+from nncf.tensor.definitions import TensorBackend
+from nncf.tensor.definitions import TensorDataType


 class OVWeightCompressionAlgoBackend(WeightCompressionAlgoBackend):
@@ -245,19 +246,25 @@ def _create_compression_subgraph(
         original_shape = weight.shape
         compressed_weight = compress_weight(weight, reduction_axes, compression_config, layer_scales, layer_zero_points)

-        compressed_const = self._create_ov_const_from_tensor(compressed_weight.tensor, compression_dtype, name=const_node_name)
+        compressed_const = self._create_ov_const_from_tensor(
+            compressed_weight.tensor, compression_dtype, name=const_node_name
+        )
         if compressed_const.get_element_type() != compression_dtype:
             compressed_const = opset.convert(compressed_const, compression_dtype)
         converted_const = opset.convert(compressed_const, ov.Type.f16)

         if compressed_weight.zero_point is not None:
-            zero_point_const = self._create_ov_const_from_tensor(compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point")
+            zero_point_const = self._create_ov_const_from_tensor(
+                compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point"
+            )
             zero_point_const = opset.convert(zero_point_const, ov.Type.f16)
             converted_const = opset.subtract(
                 converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract"
             )

-        scale_const = self._create_ov_const_from_tensor(compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale")
+        scale_const = self._create_ov_const_from_tensor(
+            compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale"
+        )
         if scale_const.get_element_type() != ov.Type.f16:
             scale_const = opset.convert(scale_const, ov.Type.f16)

@@ -291,11 +298,10 @@ def transform_model(
             const_node = self.name_to_node_mapping[const_node_name]
             const_node_output = const_node.output(0)
             const_dtype = const_node_output.get_element_type()
-            weight = Tensor(get_const_value(const_node))
-            # TODO: try to support bf16 by creating a Tensor with OV backend
-            # weight = Tensor(get_const_value(const_node, bf16_to_fp32=False))
-            # if const_dtype == ov.Type.bf16:
-            #     weight._is_bf16 = True
+            weight = get_const_value(const_node, cast_bf16_to_fp32=False)
+            if const_dtype == ov.Type.bf16:
+                weight = ov.Tensor(weight, weight.shape, ov.Type.bf16)
+            weight = Tensor(weight)

             should_add_convert_node = False
             if const_dtype != ov.Type.f16:
@@ -325,6 +331,11 @@ def transform_model(
                 target_input.replace_source_output(mul_output)

             if lora_correction_algo is not None and lora_correction_algo.is_applicable(wc_params):
+                if weight.backend == TensorBackend.ov:
+                    if weight.dtype == TensorDataType.bfloat16:
+                        weight = weight.astype(TensorDataType.float32)
+                    weight = weight.to_backend(TensorBackend.numpy)
+                # TODO: cast int4 ov tensor too?
                 adapters = lora_correction_algo.calculate_adapters(weight, compressed_weight, wc_params)
                 self.insert_adapters(wc_params, *adapters, int8_lora=lora_correction_algo.use_int8_adapters)

@@ -340,12 +351,15 @@ def dump_parameters(
         dump_parameters(model, parameters, algo_name, path)

     @staticmethod
-    def _create_ov_const_from_tensor(x: Tensor, dtype: Optional[ov.Type] = None, name: Optional[str] = None) -> Constant:
+    def _create_ov_const_from_tensor(
+        x: Tensor, dtype: Optional[ov.Type] = None, name: Optional[str] = None
+    ) -> Constant:
         if x.backend == TensorBackend.ov:
             return opset.constant(x.data, name=name)
         const = opset.constant(x.data, dtype=dtype, name=name)
         return const

+
 class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend):
     @staticmethod
     def get_awq_patterns():
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
index 064c842ef6f..cd7fb0d3ede 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
@@ -22,7 +22,8 @@
 import nncf
 from nncf import CompressWeightsMode
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
-from nncf.tensor import TensorDataType, Tensor
+from nncf.tensor import Tensor
+from nncf.tensor import TensorDataType


 @dataclass
diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
index 88b3a7358b7..765ada30041 100644
--- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py
+++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -25,7 +25,8 @@
 from nncf.quantization.fake_quantize import calculate_scale_zero_point
 from nncf.tensor import Tensor
 from nncf.tensor import
functions as fns -from nncf.tensor.definitions import TensorDataType, TensorBackend +from nncf.tensor.definitions import TensorBackend +from nncf.tensor.definitions import TensorDataType from nncf.utils import is_openvino_available ReductionAxes = Tuple[int, ...] @@ -378,6 +379,11 @@ def compress_weight( :return: The compressed weight and decompression parameters as instance of CompressedWeight """ if not config.is_integer(): + if weight.backend == TensorBackend.ov: + if weight.dtype == TensorDataType.bfloat16: + weight = weight.astype(TensorDataType.float32) + weight = weight.to_backend(TensorBackend.numpy) + compressed_weight, scale = calculate_normalized_weight_and_fp4_scale( weight, reduction_axes, config.group_size, precomputed_scale, config.mode ) @@ -441,7 +447,11 @@ def do_int_quantization( ): assert config.is_integer(), "The function supports integer quantization only" - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + accelerate_through_ov = ( + is_openvino_available() + and weight.backend != TensorBackend.torch + and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + ) if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") @@ -452,6 +462,11 @@ def do_int_quantization( if not accelerate_through_ov: # Reference implementation + if weight.backend == TensorBackend.ov: + if weight.dtype == TensorDataType.bfloat16: + weight = weight.astype(TensorDataType.float32) + weight = weight.to_backend(TensorBackend.numpy) + if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) @@ -466,18 +481,14 @@ def do_int_quantization( compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) return compressed_weights, scale, zero_point - import openvino as ov - weight_shape = weight.shape scale_shape = None if precomputed_scale is None else precomputed_scale.shape zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape - is_bf16 = getattr(weight, "_is_bf16", False) - input_dtype = TensorDataType.bfloat16 if is_bf16 else weight.dtype if ov_model_params is None: - # ov_model_params = OVModelParameters(input_dtype) + # ov_model_params = OVModelParameters(weight.dtype) ov_model_params = OVModelParameters( - input_dtype, + weight.dtype, dynamic=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))), recompile=bool(int(os.environ.get("RECOMPILE", "0"))), release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))), @@ -493,11 +504,10 @@ def do_int_quantization( reduction_axes, ) - weight_data = ov.Tensor(weight.data, weight.data.shape, ov.Type.bf16) if is_bf16 else weight.data if precomputed_scale is None: - compressed_weight, scale, zero_point = model(weight_data) + compressed_weight, scale, zero_point = model(weight.data) else: - inputs = [weight_data, precomputed_scale.data] + inputs = [weight.data, precomputed_scale.data] if precomputed_zero_point is not None: inputs += [precomputed_zero_point.data] compressed_weight = Tensor(model(inputs)[0]) @@ -514,7 +524,11 @@ def calculate_quantized_dequantized_weight( invert_division: Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None, ) -> Tensor: - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + accelerate_through_ov 
= ( + is_openvino_available() + and weight.backend != TensorBackend.torch + and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + ) if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") @@ -524,23 +538,18 @@ def calculate_quantized_dequantized_weight( decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) return decompressed_weight - import openvino as ov - weight_shape = weight.shape scale_shape = scale.shape zero_point_shape = None if zero_point is None else zero_point.shape - is_bf16 = getattr(weight, "_is_bf16", False) - input_dtype = TensorDataType.bfloat16 if is_bf16 else weight.dtype if ov_model_params is None: - ov_model_params = OVModelParameters(input_dtype) + ov_model_params = OVModelParameters(weight.dtype) if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: ov_model_params.dynamic = False model = get_compress_decompress_weight_model(ov_model_params, config, weight_shape, scale_shape, zero_point_shape) - weight_data = ov.Tensor(weight.data, weight.data.shape, ov.Type.bf16) if is_bf16 else weight.data - inputs = [weight_data, scale.data] + inputs = [weight.data, scale.data] if zero_point is not None: inputs.append(zero_point.data) results = model(inputs) diff --git a/nncf/tensor/functions/numeric.py b/nncf/tensor/functions/numeric.py index 715c963bf89..cdec5788bf6 100644 --- a/nncf/tensor/functions/numeric.py +++ b/nncf/tensor/functions/numeric.py @@ -130,27 +130,6 @@ def astype(a: Tensor, data_type: TensorDataType) -> Tensor: :return: Copy of the tensor in specified type. """ - # is_bf16 = getattr(a, "_is_bf16", False) - # if is_bf16: - # def bf16_to_fp32_v2(x): - # # Step 1: Interpret the float16 data as uint16 to access the raw bits - # custom16_bits = x.view(np.uint16) # Keep as uint16 - # - # # Step 2: Allocate uint32 to hold the result (in-place modification in original variable) - # custom16_bits = custom16_bits.astype(np.uint32) # Cast to uint32 for safe shifting - # - # # Step 3: Extract and shift sign, exponent, and fraction directly into custom16_bits - # custom16_bits = (((custom16_bits & 0x8000) << 16) | # Extract and move sign bit to bit 31 - # ((custom16_bits & 0x7F80) << 16) | # Extract and move exponent to bits 30-23 - # ((custom16_bits & 0x007F) << 16)) # Extract and move fraction to bits 22-0 - # - # # Step 4: Interpret the resulting 32-bit integers as float32 - # float32_array = custom16_bits.view(np.float32) - # - # return float32_array - # - # fp32_data = bf16_to_fp32_v2(a.data) - # Tensor(astype(fp32_data, data_type)) return Tensor(astype(a.data, data_type)) @@ -926,3 +905,9 @@ def ceil(a: Tensor) -> Tensor: :return: An array of the same type as a, containing the ceiling values. """ return Tensor(ceil(a.data)) + + +@functools.singledispatch +@tensor_guard +def to_backend(a: Tensor, b: TensorBackend) -> Tensor: + return Tensor(to_backend(a.data, b)) diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index cd094e7a0e0..fbc28418fb9 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -8,6 +8,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Tuple, Union import numpy as np import openvino as ov @@ -32,14 +33,35 @@ DTYPE_MAP_REV = {v: k for k, v in DTYPE_MAP.items()} +def _bf16_to_fp32(a: ov.Tensor) -> ov.Tensor: + assert a.get_element_type() == ov.Type.bf16 and a.data.dtype == np.float16 + + a = a.data.view(np.uint16) + + res = a.astype(np.uint32) + res = ( + ((res & 0x8000) << 16) # Move sign bit to bit 31 + | ((res & 0x7F80) << 16) # Move exponent to bits 30-23 + | ((res & 0x007F) << 16) + ) # Move fraction to bits 22-0 + res = res.view(np.float32) + + res = ov.Tensor(res) + return res + + @numeric.backend.register(ov.Tensor) def _(a: ov.Tensor) -> TensorBackend: return TensorBackend.ov @numeric.astype.register(ov.Tensor) -def _(a: ov.Tensor, dtype: TensorDataType) -> np.ndarray: - return a.data.astype(NP_DTYPE_MAP[dtype]) +def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: + if dtype == TensorDataType.bfloat16: + raise ValueError("Not supported conversion") + if a.get_element_type() == ov.Type.bf16: + a = _bf16_to_fp32(a) + return ov.Tensor(a.data.astype(NP_DTYPE_MAP[dtype])) @numeric.dtype.register(ov.Tensor) @@ -50,3 +72,15 @@ def _(a: ov.Tensor) -> TensorDataType: @numeric.size.register(ov.Tensor) def _(a: ov.Tensor) -> int: return a.size + + +@numeric.reshape.register(ov.Tensor) +def _(a: ov.Tensor, shape: Union[int, Tuple[int, ...]]) -> ov.Tensor: + return ov.Tensor(a.data.reshape(shape), shape, a.get_element_type()) + + +@numeric.to_backend.register(ov.Tensor) +def _(a: ov.Tensor, b: TensorBackend) -> np.ndarray: + if b != TensorBackend.numpy: + raise ValueError("Not supported backend") + return a.data diff --git a/nncf/tensor/tensor.py b/nncf/tensor/tensor.py index 52966be1ad1..1f776e19ad6 100644 --- a/nncf/tensor/tensor.py +++ b/nncf/tensor/tensor.py @@ -194,6 +194,9 @@ def item(self) -> float: def clone(self) -> float: return _call_function("clone", self) + def to_backend(self, backend: TensorBackend) -> Tensor: + return _call_function("to_backend", self, backend) + def _call_function(func_name: str, *args): """ From b636c667a73b3560eb964812bcff9142c7145277 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 25 Oct 2024 13:43:39 +0200 Subject: [PATCH 06/73] Tweak lora type hint --- .../algorithms/weight_compression/lora_correction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/lora_correction.py b/nncf/quantization/algorithms/weight_compression/lora_correction.py index 0c9bb3409ba..212eb5e79fb 100644 --- a/nncf/quantization/algorithms/weight_compression/lora_correction.py +++ b/nncf/quantization/algorithms/weight_compression/lora_correction.py @@ -24,7 +24,7 @@ from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization, CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization from nncf.tensor import Tensor @@ -105,7 +105,7 @@ def is_applicable(self, wc_params: WeightCompressionParameters): return wc_params.compression_config.num_bits == 4 def 
calculate_adapters( - self, weight: Tensor, compressed_weight: Tensor, wc_params: WeightCompressionParameters + self, weight: Tensor, compressed_weight: CompressedWeight, wc_params: WeightCompressionParameters ) -> Tuple[Tensor, Tensor, List[float]]: """ Calculates low rank matrices for a given original and compressed weights. @@ -134,7 +134,7 @@ def calculate_adapters( @staticmethod def calculate_low_rank_matrices( weight: Tensor, - compressed_weight: Tensor, + compressed_weight: CompressedWeight, compression_config: WeightCompressionConfig, reduction_axes: Tuple[int, ...], lora_correction_params: AdvancedLoraCorrectionParameters, From f0129efecd8e1bf7e43a330e1ab610e4acddd50e Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 25 Oct 2024 14:03:19 +0200 Subject: [PATCH 07/73] Tweaks --- .../algorithms/weight_compression/openvino_modeling.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index cd7fb0d3ede..16d16b7314c 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -258,13 +258,11 @@ def _get_compress_model( num_bits = config.num_bits if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: - # dtype = ov.Type.u8 dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 level_low = 0 level_high = 2**num_bits - 1 - compressed_w += opset.convert(zp, ov.Type.f32) + compressed_w += zp if zp.get_element_type() == ov.Type.f32 else opset.convert(zp, ov.Type.f32) elif config.mode in [CompressWeightsMode.INT8_SYM, config.mode.INT4_SYM]: - # dtype = ov.Type.i8 dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.u4 level_low = -(2 ** (num_bits - 1)) level_high = 2 ** (num_bits - 1) - 1 From e887e70b5da19ceaa7bc29f27e7d4a06baccb9cd Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 25 Oct 2024 16:08:34 +0200 Subject: [PATCH 08/73] Added share_inputs --- .../weight_compression/openvino_backend.py | 9 +++++--- .../weight_compression/openvino_modeling.py | 23 ++++++++++--------- .../weight_compression/weight_lowering.py | 5 +++- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 49924ead6f5..840170f4f8b 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -330,6 +330,10 @@ def transform_model( for target_input in const_node.output(0).get_target_inputs(): target_input.replace_source_output(mul_output) + # if compressed_weight.tensor.backend == TensorBackend.ov: + # if compressed_weight.tensor.dtype == TensorDataType.uint4: + # compressed_weight.tensor = compressed_weight.tensor.astype(TensorDataType.uint8) + # compressed_weight.tensor = compressed_weight.tensor.to_backend(TensorBackend.numpy) if lora_correction_algo is not None and lora_correction_algo.is_applicable(wc_params): if weight.backend == TensorBackend.ov: if weight.dtype == TensorDataType.bfloat16: @@ -351,10 +355,9 @@ def dump_parameters( dump_parameters(model, parameters, algo_name, path) @staticmethod - def _create_ov_const_from_tensor( - x: Tensor, dtype: Optional[ov.Type] = None, name: Optional[str] = None - ) -> Constant: + def 
_create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = None) -> Constant: if x.backend == TensorBackend.ov: + assert x.data.get_element_type() == dtype return opset.constant(x.data, name=name) const = opset.constant(x.data, dtype=dtype, name=name) return const diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 16d16b7314c..5bc302c50b8 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -10,7 +10,6 @@ # limitations under the License. import inspect -import os from dataclasses import dataclass from functools import partial from typing import List, Optional, Tuple @@ -19,7 +18,6 @@ import openvino as ov from openvino.runtime import opset13 as opset -import nncf from nncf import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.tensor import Tensor @@ -29,13 +27,14 @@ @dataclass class OVModelParameters: input_dtype: TensorDataType - dynamic: bool = False + dynamic_shapes: bool = False recompile: bool = False release_memory: bool = True + share_inputs: bool = True share_outputs: bool = True def __hash__(self): - return hash((self.input_dtype, self.dynamic, self.recompile, self.release_memory, self.share_outputs)) + return hash((self.input_dtype, self.dynamic_shapes, self.recompile, self.release_memory, self.share_inputs, self.share_outputs)) class CompiledModelCache: @@ -57,7 +56,7 @@ def clear_cache(): def cache_results(func): - def wrapper(*args, **kwargs): + def wrapper(*args, disable_caching=False, **kwargs): sig = inspect.signature(func) new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} new_kwargs.update(kwargs) @@ -66,8 +65,7 @@ def wrapper(*args, **kwargs): if cache_key in cache: return cache[cache_key] result = func(*args, **kwargs) - recompile = new_kwargs["ov_model_params"].recompile - if not recompile: + if not disable_caching: cache[cache_key] = result return result @@ -76,7 +74,7 @@ def wrapper(*args, **kwargs): def run_model(ov_model_params, compiled_model, inputs): # Returns results as numpy tensors - outputs = compiled_model(inputs, share_outputs=ov_model_params.share_outputs) + outputs = compiled_model(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) outputs = [Tensor(outputs[i]) for i in range(len(outputs))] if ov_model_params.release_memory: compiled_model.release_memory() @@ -86,7 +84,8 @@ def run_model(ov_model_params, compiled_model, inputs): def run_model_via_infer_request(ov_model_params, compiled_model, inputs): # Returns results as ov tensors infer_request = compiled_model.create_infer_request() - infer_request.infer(inputs, share_outputs=ov_model_params.share_outputs) + # TODO: try share_inputs=True + infer_request.infer(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) outputs = [Tensor(infer_request.get_output_tensor(i)) for i in range(len(infer_request.results))] if ov_model_params.release_memory: compiled_model.release_memory() @@ -106,7 +105,7 @@ def get_compress_weight_model( # if (scale_shape is None) != (reduction_axes is not None): # raise Exception("Either one of scale_shape or reduction_axes must be provided at the same time.") - if ov_model_params.dynamic: + if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) if 
scale_shape is not None: scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) @@ -121,6 +120,7 @@ def get_compress_weight_model( zero_point_shape, reduction_axes, return_nodes=False, + disable_caching=ov_model_params.recompile, ) @@ -131,7 +131,7 @@ def get_compress_decompress_weight_model( scale_shape: Optional[Tuple], zero_point_shape: Optional[Tuple] = None, ): - if ov_model_params.dynamic: + if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) if zero_point_shape is not None: @@ -143,6 +143,7 @@ def get_compress_decompress_weight_model( weight_shape, scale_shape, zero_point_shape, + disable_caching=ov_model_params.recompile, ) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 765ada30041..dfc369b0c9a 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -489,7 +489,7 @@ def do_int_quantization( # ov_model_params = OVModelParameters(weight.dtype) ov_model_params = OVModelParameters( weight.dtype, - dynamic=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))), + dynamic_shapes=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))), recompile=bool(int(os.environ.get("RECOMPILE", "0"))), release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))), share_outputs=bool(int(os.environ.get("SHARE_OUTPUTS", "0"))), @@ -506,6 +506,9 @@ def do_int_quantization( if precomputed_scale is None: compressed_weight, scale, zero_point = model(weight.data) + # Scale is always in fp32 so there is no need to store it in ov.Tensor + if scale.backend == TensorBackend.ov: + scale = scale.to_backend(TensorBackend.numpy) else: inputs = [weight.data, precomputed_scale.data] if precomputed_zero_point is not None: From 9141a8a6ec7ee595fbadef488a11f1b64cbd483d Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 25 Oct 2024 16:50:26 +0200 Subject: [PATCH 09/73] Modeling tweaks --- .../weight_compression/openvino_modeling.py | 119 +++++++----------- .../weight_compression/weight_lowering.py | 15 ++- 2 files changed, 53 insertions(+), 81 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 5bc302c50b8..f419995959d 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -12,7 +12,7 @@ import inspect from dataclasses import dataclass from functools import partial -from typing import List, Optional, Tuple +from typing import Optional, Tuple, Callable, List import numpy as np import openvino as ov @@ -23,6 +23,9 @@ from nncf.tensor import Tensor from nncf.tensor import TensorDataType +TensorList = List[Tensor] +ModelCallable = Callable[[TensorList], TensorList] + @dataclass class OVModelParameters: @@ -32,10 +35,10 @@ class OVModelParameters: release_memory: bool = True share_inputs: bool = True share_outputs: bool = True + return_ov_tensors: bool = False def __hash__(self): - return hash((self.input_dtype, self.dynamic_shapes, self.recompile, self.release_memory, self.share_inputs, self.share_outputs)) - + return hash((self.input_dtype, self.dynamic_shapes, self.recompile, self.release_memory, self.share_inputs, self.share_outputs, self.return_ov_tensors)) class CompiledModelCache: def __init__(self): @@ -72,8 +75,9 @@ 
def wrapper(*args, disable_caching=False, **kwargs): return wrapper -def run_model(ov_model_params, compiled_model, inputs): +def run_model(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList: # Returns results as numpy tensors + inputs = [inp.data for inp in inputs] outputs = compiled_model(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) outputs = [Tensor(outputs[i]) for i in range(len(outputs))] if ov_model_params.release_memory: @@ -81,10 +85,10 @@ def run_model(ov_model_params, compiled_model, inputs): return outputs -def run_model_via_infer_request(ov_model_params, compiled_model, inputs): +def run_model_via_infer_request(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList: # Returns results as ov tensors + inputs = [inp.data for inp in inputs] infer_request = compiled_model.create_infer_request() - # TODO: try share_inputs=True infer_request.infer(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) outputs = [Tensor(infer_request.get_output_tensor(i)) for i in range(len(infer_request.results))] if ov_model_params.release_memory: @@ -99,7 +103,7 @@ def get_compress_weight_model( scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, -): +) -> ModelCallable: if scale_shape is None and zero_point_shape is not None: raise Exception("Zero point shape can only be provided if scale shape is provided.") # if (scale_shape is None) != (reduction_axes is not None): @@ -112,6 +116,8 @@ def get_compress_weight_model( if zero_point_shape is not None: zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) + ov_model_params.return_ov_tensors = config.num_bits == 4 + return _build_compress_model( config, ov_model_params, @@ -130,7 +136,7 @@ def get_compress_decompress_weight_model( weight_shape: Tuple, scale_shape: Optional[Tuple], zero_point_shape: Optional[Tuple] = None, -): +) -> ModelCallable: if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) @@ -156,7 +162,7 @@ def _build_compress_model( zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, -): +) -> ModelCallable: if ov_model_params.input_dtype == TensorDataType.float32: input_dtype = ov.Type.f32 elif ov_model_params.input_dtype == TensorDataType.float16: @@ -178,6 +184,7 @@ def _build_compress_model( if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) ov_parameters.append(zero_point) + zero_point = opset.convert(zero_point, ov.Type.f32) else: # Compute compressed weight, scale and, possibly, zero point @@ -213,56 +220,16 @@ def _build_compress_model( scale /= level_high scale = opset.select(opset.abs(scale) < eps, eps, scale) - return _get_compress_model( - config, - ov_model_params, - ov_parameters, - weight, - scale, - zero_point, - return_nodes, - ) - - -@cache_results -def _build_compress_decompress_model( - config: WeightCompressionConfig, - ov_model_params: OVModelParameters, - weight_shape: Tuple, - scale_shape: Tuple, - zero_point_shape: Optional[Tuple] = None, -): - ov_parameters, ov_results = _build_compress_model( - config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True - ) - return 
_get_compress_decompress_model( - config, - ov_model_params, - ov_parameters, - ov_results, - ) - - -def _get_compress_model( - config: WeightCompressionConfig, - ov_model_params: OVModelParameters, - ov_parameters: List[ov._pyopenvino.op.Parameter], - w: ov.runtime.Node, - s: ov.runtime.Node, - zp: Optional[ov.runtime.Node] = None, - return_nodes: Optional[bool] = False, -): - if w.get_element_type() != ov.Type.f32: - w = opset.convert(w, ov.Type.f32) - - compressed_w = w / s + if weight.get_element_type() != ov.Type.f32: + weight = opset.convert(weight, ov.Type.f32) + compressed_w = weight / scale num_bits = config.num_bits if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 level_low = 0 - level_high = 2**num_bits - 1 - compressed_w += zp if zp.get_element_type() == ov.Type.f32 else opset.convert(zp, ov.Type.f32) + level_high = 2 ** num_bits - 1 + compressed_w += zero_point elif config.mode in [CompressWeightsMode.INT8_SYM, config.mode.INT4_SYM]: dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.u4 level_low = -(2 ** (num_bits - 1)) @@ -275,9 +242,9 @@ def _get_compress_model( ov_results = [compressed_w] if len(ov_parameters) == 1: - ov_results.append(s) - if zp is not None: - ov_results.append(opset.convert(zp, compressed_w.get_element_type())) + ov_results.append(scale) + if zero_point is not None: + ov_results.append(opset.convert(zero_point, compressed_w.get_element_type())) if return_nodes: return ov_parameters, ov_results @@ -285,33 +252,39 @@ def _get_compress_model( model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - run_fn = run_model_via_infer_request if config.num_bits == 4 else run_model + run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model return partial(run_fn, ov_model_params, compiled_model) -def _get_compress_decompress_model( +@cache_results +def _build_compress_decompress_model( config: WeightCompressionConfig, ov_model_params: OVModelParameters, - parameters: List[ov._pyopenvino.op.Parameter], - results: List[ov._pyopenvino.Node], -): - if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: - if len(results) == 1: - compressed_w = results[0] - s, zp = parameters[1], parameters[2] + weight_shape: Tuple, + scale_shape: Tuple, + zero_point_shape: Optional[Tuple] = None, +) -> ModelCallable: + ov_parameters, ov_results = _build_compress_model( + config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True + ) + + if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + if len(ov_results) == 1: + compressed_w = ov_results[0] + s, zp = ov_parameters[1], ov_parameters[2] else: - compressed_w, s, zp = results + compressed_w, s, zp = ov_results decompressed_w = (compressed_w - zp) * s else: - if len(results) == 1: - compressed_w = results[0] - s = parameters[1] + if len(ov_results) == 1: + compressed_w = ov_results[0] + s = ov_parameters[1] else: - compressed_w, s = results + compressed_w, s = ov_results decompressed_w = compressed_w * s - model = ov.Model([decompressed_w], parameters) + model = ov.Model([decompressed_w], ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - run_fn = run_model_via_infer_request if config.num_bits == 4 else run_model + run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model 
return partial(run_fn, ov_model_params, compiled_model) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index dfc369b0c9a..3cc476f7c97 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -505,15 +505,15 @@ def do_int_quantization( ) if precomputed_scale is None: - compressed_weight, scale, zero_point = model(weight.data) + compressed_weight, scale, zero_point = model([weight]) # Scale is always in fp32 so there is no need to store it in ov.Tensor if scale.backend == TensorBackend.ov: scale = scale.to_backend(TensorBackend.numpy) else: - inputs = [weight.data, precomputed_scale.data] + inputs = [weight, precomputed_scale] if precomputed_zero_point is not None: - inputs += [precomputed_zero_point.data] - compressed_weight = Tensor(model(inputs)[0]) + inputs += [precomputed_zero_point] + compressed_weight = model(inputs)[0] scale, zero_point = precomputed_scale, precomputed_zero_point return compressed_weight, scale, zero_point @@ -552,9 +552,8 @@ def calculate_quantized_dequantized_weight( model = get_compress_decompress_weight_model(ov_model_params, config, weight_shape, scale_shape, zero_point_shape) - inputs = [weight.data, scale.data] + inputs = [weight, scale] if zero_point is not None: - inputs.append(zero_point.data) - results = model(inputs) - decompressed_weight = [Tensor(it) for it in results][0] + inputs.append(zero_point) + decompressed_weight = model(inputs)[0] return decompressed_weight From a43c5142d897c0842afd2b74ac26fcdff4b40212 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 25 Oct 2024 16:59:00 +0200 Subject: [PATCH 10/73] Move results_cache into separate file --- .../weight_compression/lora_correction.py | 3 +- .../weight_compression/openvino_modeling.py | 66 +++++++------------ nncf/results_caching.py | 39 +++++++++++ weight_compression.py | 6 +- 4 files changed, 69 insertions(+), 45 deletions(-) create mode 100644 nncf/results_caching.py diff --git a/nncf/quantization/algorithms/weight_compression/lora_correction.py b/nncf/quantization/algorithms/weight_compression/lora_correction.py index 212eb5e79fb..18167b9704e 100644 --- a/nncf/quantization/algorithms/weight_compression/lora_correction.py +++ b/nncf/quantization/algorithms/weight_compression/lora_correction.py @@ -24,7 +24,8 @@ from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization, CompressedWeight +from nncf.quantization.algorithms.weight_compression.weight_lowering import CompressedWeight +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization from nncf.tensor import Tensor diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index f419995959d..1aa5ccf65f5 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ 
b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -9,10 +9,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import inspect from dataclasses import dataclass from functools import partial -from typing import Optional, Tuple, Callable, List +from typing import Callable, List, Optional, Tuple import numpy as np import openvino as ov @@ -20,6 +19,8 @@ from nncf import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.results_caching import ResultsCacheContainer +from nncf.results_caching import cache_results from nncf.tensor import Tensor from nncf.tensor import TensorDataType @@ -27,6 +28,9 @@ ModelCallable = Callable[[TensorList], TensorList] +OV_MODEL_CACHE = ResultsCacheContainer() + + @dataclass class OVModelParameters: input_dtype: TensorDataType @@ -38,54 +42,34 @@ class OVModelParameters: return_ov_tensors: bool = False def __hash__(self): - return hash((self.input_dtype, self.dynamic_shapes, self.recompile, self.release_memory, self.share_inputs, self.share_outputs, self.return_ov_tensors)) - -class CompiledModelCache: - def __init__(self): - self._cache = {} - - def clear(self): - self._cache.clear() - - def is_empty(self): - return len(self._cache) == 0 - - -COMPILED_MODEL_CACHE = CompiledModelCache() - - -def clear_cache(): - COMPILED_MODEL_CACHE.clear() - - -def cache_results(func): - def wrapper(*args, disable_caching=False, **kwargs): - sig = inspect.signature(func) - new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} - new_kwargs.update(kwargs) - cache_key = (func.__name__, frozenset(new_kwargs.items())) - cache = COMPILED_MODEL_CACHE._cache - if cache_key in cache: - return cache[cache_key] - result = func(*args, **kwargs) - if not disable_caching: - cache[cache_key] = result - return result - - return wrapper + return hash( + ( + self.input_dtype, + self.dynamic_shapes, + self.recompile, + self.release_memory, + self.share_inputs, + self.share_outputs, + self.return_ov_tensors, + ) + ) def run_model(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList: # Returns results as numpy tensors inputs = [inp.data for inp in inputs] - outputs = compiled_model(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) + outputs = compiled_model( + inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs + ) outputs = [Tensor(outputs[i]) for i in range(len(outputs))] if ov_model_params.release_memory: compiled_model.release_memory() return outputs -def run_model_via_infer_request(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList: +def run_model_via_infer_request( + ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList +) -> TensorList: # Returns results as ov tensors inputs = [inp.data for inp in inputs] infer_request = compiled_model.create_infer_request() @@ -153,7 +137,7 @@ def get_compress_decompress_weight_model( ) -@cache_results +@cache_results(OV_MODEL_CACHE) def _build_compress_model( config: WeightCompressionConfig, ov_model_params: OVModelParameters, @@ -256,7 +240,7 @@ def _build_compress_model( return partial(run_fn, ov_model_params, compiled_model) -@cache_results +@cache_results(OV_MODEL_CACHE) def _build_compress_decompress_model( config: WeightCompressionConfig, ov_model_params: 
OVModelParameters, diff --git a/nncf/results_caching.py b/nncf/results_caching.py new file mode 100644 index 00000000000..447ed3966dd --- /dev/null +++ b/nncf/results_caching.py @@ -0,0 +1,39 @@ +import inspect + + +class ResultsCacheContainer: + def __init__(self): + self._cache = {} + + def clear(self): + self._cache.clear() + + def is_empty(self): + return len(self._cache) == 0 + + def __getitem__(self, item): + return self._cache[item] + + def __setitem__(self, key, value): + self._cache[key] = value + + def __contains__(self, item): + return item in self._cache + + +def cache_results(cache: ResultsCacheContainer): + def decorator(func): + def wrapper(*args, disable_caching=False, **kwargs): + sig = inspect.signature(func) + new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} + new_kwargs.update(kwargs) + cache_key = (func.__name__, frozenset(new_kwargs.items())) + if cache_key in cache: + return cache[cache_key] + result = func(*args, **kwargs) + if not disable_caching: + cache[cache_key] = result + return result + + return wrapper + return decorator diff --git a/weight_compression.py b/weight_compression.py index 5bfc3bd24d7..245016e8035 100644 --- a/weight_compression.py +++ b/weight_compression.py @@ -20,7 +20,7 @@ import openvino as ov import nncf -from nncf.quantization.algorithms.weight_compression.openvino_modeling import COMPILED_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE from tools.memory_monitor import MemoryMonitor from tools.memory_monitor import MemoryType @@ -150,8 +150,8 @@ def main(args): time.sleep(0.5) before_cache_deletion = memory_monitors[2].get_data(True)[1][-1] - if not COMPILED_MODEL_CACHE.is_empty(): - COMPILED_MODEL_CACHE.clear() + if not OV_MODEL_CACHE.is_empty(): + OV_MODEL_CACHE.clear() gc.collect() time.sleep(memory_monitors[0].interval * 10) after_cache_deletion = memory_monitors[2].get_data(True)[1][-1] From 1216f65573c2cd9f71b83ca8cf2612e84e430b8b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 25 Oct 2024 17:43:43 +0200 Subject: [PATCH 11/73] Implement astype for ov backend for bf16, u4, i4 --- .../weight_compression/openvino_backend.py | 12 ++--- .../weight_compression/openvino_modeling.py | 39 +++++++++----- .../weight_compression/weight_lowering.py | 4 -- nncf/results_caching.py | 1 + nncf/tensor/definitions.py | 2 + nncf/tensor/functions/ov.py | 53 +++++++++++++------ 6 files changed, 69 insertions(+), 42 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 840170f4f8b..5c328e372b0 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -329,17 +329,13 @@ def transform_model( mul_output = mul.output(0) for target_input in const_node.output(0).get_target_inputs(): target_input.replace_source_output(mul_output) - - # if compressed_weight.tensor.backend == TensorBackend.ov: - # if compressed_weight.tensor.dtype == TensorDataType.uint4: - # compressed_weight.tensor = compressed_weight.tensor.astype(TensorDataType.uint8) - # compressed_weight.tensor = compressed_weight.tensor.to_backend(TensorBackend.numpy) if lora_correction_algo is not None and lora_correction_algo.is_applicable(wc_params): if weight.backend == TensorBackend.ov: - if weight.dtype == TensorDataType.bfloat16: - weight = weight.astype(TensorDataType.float32) weight = 
weight.to_backend(TensorBackend.numpy) - # TODO: cast int4 ov tensor too? + if compressed_weight.tensor.backend == TensorBackend.ov: + compressed_weight.tensor = compressed_weight.tensor.to_backend(TensorBackend.numpy) + if compressed_weight.zero_point.backend == TensorBackend.ov: + compressed_weight.zero_point = compressed_weight.zero_point.to_backend(TensorBackend.numpy) adapters = lora_correction_algo.calculate_adapters(weight, compressed_weight, wc_params) self.insert_adapters(wc_params, *adapters, int8_lora=lora_correction_algo.use_int8_adapters) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 1aa5ccf65f5..1008e872ba7 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -23,6 +23,7 @@ from nncf.results_caching import cache_results from nncf.tensor import Tensor from nncf.tensor import TensorDataType +from nncf.tensor.functions.ov import DTYPE_MAP as OV_DTYPE_MAP TensorList = List[Tensor] ModelCallable = Callable[[TensorList], TensorList] @@ -57,7 +58,8 @@ def __hash__(self): def run_model(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList: # Returns results as numpy tensors - inputs = [inp.data for inp in inputs] + if any(isinstance(it, Tensor) for it in inputs): + inputs = [inp.data for inp in inputs] outputs = compiled_model( inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs ) @@ -71,7 +73,8 @@ def run_model_via_infer_request( ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList ) -> TensorList: # Returns results as ov tensors - inputs = [inp.data for inp in inputs] + if any(isinstance(it, Tensor) for it in inputs): + inputs = [inp.data for inp in inputs] infer_request = compiled_model.create_infer_request() infer_request.infer(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) outputs = [Tensor(infer_request.get_output_tensor(i)) for i in range(len(infer_request.results))] @@ -100,7 +103,8 @@ def get_compress_weight_model( if zero_point_shape is not None: zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) - ov_model_params.return_ov_tensors = config.num_bits == 4 + if config.num_bits == 4: + ov_model_params.return_ov_tensors = True return _build_compress_model( config, @@ -147,15 +151,7 @@ def _build_compress_model( reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, ) -> ModelCallable: - if ov_model_params.input_dtype == TensorDataType.float32: - input_dtype = ov.Type.f32 - elif ov_model_params.input_dtype == TensorDataType.float16: - input_dtype = ov.Type.f16 - elif ov_model_params.input_dtype == TensorDataType.bfloat16: - input_dtype = ov.Type.bf16 - else: - raise Exception - weight = opset.parameter(weight_shape, name="w", dtype=input_dtype) + weight = opset.parameter(weight_shape, name="w", dtype=OV_DTYPE_MAP[ov_model_params.input_dtype]) ov_parameters = [weight] if scale_shape is not None: @@ -212,7 +208,7 @@ def _build_compress_model( if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 level_low = 0 - level_high = 2 ** num_bits - 1 + level_high = 2**num_bits - 1 compressed_w += zero_point elif config.mode in [CompressWeightsMode.INT8_SYM, 
config.mode.INT4_SYM]: dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.u4 @@ -272,3 +268,20 @@ def _build_compress_decompress_model( run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model return partial(run_fn, ov_model_params, compiled_model) + + +def get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable: + if ov_model_params.dynamic_shapes: + arg_shape = (-1,) * len(arg_shape) + return _build_astype_model(ov_model_params, arg_shape, dtype) + + +@cache_results(OV_MODEL_CACHE) +def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable: + arg = opset.parameter(arg_shape, dtype=OV_DTYPE_MAP[ov_model_params.input_dtype]) + res = opset.convert(arg, OV_DTYPE_MAP[dtype]) + model = ov.Model([res], [arg]) + compiled_model = ov.compile_model(model, device_name="CPU") + + run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model + return partial(run_fn, ov_model_params, compiled_model) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 3cc476f7c97..3af76eed391 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -380,8 +380,6 @@ def compress_weight( """ if not config.is_integer(): if weight.backend == TensorBackend.ov: - if weight.dtype == TensorDataType.bfloat16: - weight = weight.astype(TensorDataType.float32) weight = weight.to_backend(TensorBackend.numpy) compressed_weight, scale = calculate_normalized_weight_and_fp4_scale( @@ -463,8 +461,6 @@ def do_int_quantization( # Reference implementation if weight.backend == TensorBackend.ov: - if weight.dtype == TensorDataType.bfloat16: - weight = weight.astype(TensorDataType.float32) weight = weight.to_backend(TensorBackend.numpy) if weight.dtype != TensorDataType.float32: diff --git a/nncf/results_caching.py b/nncf/results_caching.py index 447ed3966dd..4a991a36be7 100644 --- a/nncf/results_caching.py +++ b/nncf/results_caching.py @@ -36,4 +36,5 @@ def wrapper(*args, disable_caching=False, **kwargs): return result return wrapper + return decorator diff --git a/nncf/tensor/definitions.py b/nncf/tensor/definitions.py index a4849e558e3..67b3bf7ed5e 100644 --- a/nncf/tensor/definitions.py +++ b/nncf/tensor/definitions.py @@ -36,6 +36,8 @@ class TensorDataType(Enum): int32 = auto() int64 = auto() uint8 = auto() + uint4 = auto() + int4 = auto() def is_float(self): """ diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index fbc28418fb9..f8cd0431f83 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -28,26 +28,34 @@ TensorDataType.int32: ov.Type.i32, TensorDataType.int64: ov.Type.i64, TensorDataType.uint8: ov.Type.u8, + TensorDataType.uint4: ov.Type.u4, + TensorDataType.int4: ov.Type.i4, } DTYPE_MAP_REV = {v: k for k, v in DTYPE_MAP.items()} -def _bf16_to_fp32(a: ov.Tensor) -> ov.Tensor: - assert a.get_element_type() == ov.Type.bf16 and a.data.dtype == np.float16 +def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: + from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters + from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model - a = a.data.view(np.uint16) + a_dtype = DTYPE_MAP_REV[a.get_element_type()] + 
assert a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4] - res = a.astype(np.uint32) - res = ( - ((res & 0x8000) << 16) # Move sign bit to bit 31 - | ((res & 0x7F80) << 16) # Move exponent to bits 30-23 - | ((res & 0x007F) << 16) - ) # Move fraction to bits 22-0 - res = res.view(np.float32) - - res = ov.Tensor(res) - return res + model = get_astype_model( + OVModelParameters( + input_dtype=a_dtype, + dynamic_shapes=True, + recompile=False, + release_memory=True, + share_inputs=True, + share_outputs=True, + return_ov_tensors=True, + ), + a.shape, + dtype, + ) + return model([a])[0].data @numeric.backend.register(ov.Tensor) @@ -57,10 +65,10 @@ def _(a: ov.Tensor) -> TensorBackend: @numeric.astype.register(ov.Tensor) def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: - if dtype == TensorDataType.bfloat16: - raise ValueError("Not supported conversion") - if a.get_element_type() == ov.Type.bf16: - a = _bf16_to_fp32(a) + a_dtype = DTYPE_MAP_REV[a.get_element_type()] + if a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]: + return _ov_astype(a, dtype) + return ov.Tensor(a.data.astype(NP_DTYPE_MAP[dtype])) @@ -83,4 +91,15 @@ def _(a: ov.Tensor, shape: Union[int, Tuple[int, ...]]) -> ov.Tensor: def _(a: ov.Tensor, b: TensorBackend) -> np.ndarray: if b != TensorBackend.numpy: raise ValueError("Not supported backend") + + # Cannot convert bfloat16, uint4, int4 to numpy directly + a_dtype = DTYPE_MAP_REV[a.get_element_type()] + if a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]: + dtype = TensorDataType.float32 + if a_dtype == TensorDataType.uint4: + dtype = TensorDataType.uint8 + elif a_dtype == TensorDataType.int4: + dtype = TensorDataType.int8 + a = _ov_astype(a, dtype) + return a.data From 8611b75dc5340e273cfa63a3ce696925aec2e877 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Sat, 26 Oct 2024 15:38:22 +0200 Subject: [PATCH 12/73] Experiments --- run_weight_compression.py | 497 ++++++++++++++++++++------------------ weight_compression.py | 2 +- 2 files changed, 268 insertions(+), 231 deletions(-) diff --git a/run_weight_compression.py b/run_weight_compression.py index d7eefec79ab..74d752ef4de 100644 --- a/run_weight_compression.py +++ b/run_weight_compression.py @@ -16,227 +16,264 @@ def stream_handler(stream, target_file): parent_log_dir = Path("compression_logs") experiment_params = [ - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", ""), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / 
"reimpl_24102024_acc/int8/tiny-llama", "--dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", ""), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", ""), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic 
--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / 
"reimpl_24102024/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / 
"Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", 
"--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - - - - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym 
--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym "), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym "), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", 
parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic "), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym "), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / 
"Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym "), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym "), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --numpy"), + (parent_model_dir / 
"tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory --share-outputs"), + + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile --share-outputs"), + 
(parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory --share-outputs"), + + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory --share-outputs"), + # # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile"), + # 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + # # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic"), + # 
(parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + + + + + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", 
"--save-model --compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / 
"reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", 
"--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile"), + # 
(parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode 
int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), @@ -264,21 +301,21 @@ def stream_handler(stream, target_file): # # # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym"), # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), # # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym "), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym"), # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), # # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym "), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym"), # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), @@ -286,11 +323,11 @@ def stream_handler(stream, target_file): # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), ] for model_dir, log_dir, params in experiment_params: @@ -322,7 +359,7 @@ def stream_handler(stream, target_file): 
evaluated_paths = set() for _, log_dir, _ in experiment_params: - for model_path in log_dir.rglob("**/*"): + for model_path in sorted(log_dir.rglob("**/*")): model_path: Path if model_path.suffix != ".xml": continue diff --git a/weight_compression.py b/weight_compression.py index 245016e8035..bae1948145c 100644 --- a/weight_compression.py +++ b/weight_compression.py @@ -191,7 +191,7 @@ def main(args): f.write( f"{model_path}," f"{model_dtype.upper()}," - f"{'NumPy' if numpy_compression else 'OV'}," + f"{'-' if numpy_compression else 'OV'}," f"{'-' if numpy_compression else recompile}," f"{'-' if numpy_compression else release_memory}," f"{'-' if numpy_compression else share_outputs}," From 071866834cb6f605fcca3a0557698071c88f8e83 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Sat, 26 Oct 2024 15:40:31 +0200 Subject: [PATCH 13/73] Support case of (weight, scale) -> (c_weight, zp) --- nncf/openvino/graph/node_utils.py | 4 +- .../weight_compression/openvino_backend.py | 6 +- .../weight_compression/openvino_modeling.py | 167 ++++++++++-------- .../weight_compression/weight_lowering.py | 34 +++- nncf/results_caching.py | 11 ++ 5 files changed, 136 insertions(+), 86 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 39056d65af5..33d67140d16 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -8,7 +8,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os + from typing import Any, Callable, Dict, List, Optional, Tuple, Type import numpy as np @@ -107,7 +107,7 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int: return cnt_if_op(model, 0) -def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = True) -> np.ndarray: +def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = False) -> np.ndarray: """ Returns the constant tensor for the node. This method is applicable only for the floating-point constant data. diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 5c328e372b0..8fbd0e2935a 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -299,8 +299,10 @@ def transform_model( const_node_output = const_node.output(0) const_dtype = const_node_output.get_element_type() weight = get_const_value(const_node, cast_bf16_to_fp32=False) - if const_dtype == ov.Type.bf16: - weight = ov.Tensor(weight, weight.shape, ov.Type.bf16) + # Creation of ov.Tensor is required for two reasons: + # 1. To be able to process BF16 weight properly + # 2. 
To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed + weight = ov.Tensor(weight, weight.shape, const_dtype) weight = Tensor(weight) should_add_convert_node = False diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 1008e872ba7..2f223d71d06 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -11,7 +11,7 @@ from dataclasses import dataclass from functools import partial -from typing import Callable, List, Optional, Tuple +from typing import Callable, List, Optional, Tuple, Union import numpy as np import openvino as ov @@ -35,6 +35,7 @@ @dataclass class OVModelParameters: input_dtype: TensorDataType + output_dtype: Optional[TensorDataType] = None dynamic_shapes: bool = False recompile: bool = False release_memory: bool = True @@ -56,30 +57,28 @@ def __hash__(self): ) -def run_model(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList: - # Returns results as numpy tensors +def run_model( + ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, return_ov_tensors: bool, inputs: TensorList +) -> TensorList: if any(isinstance(it, Tensor) for it in inputs): inputs = [inp.data for inp in inputs] - outputs = compiled_model( - inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs - ) - outputs = [Tensor(outputs[i]) for i in range(len(outputs))] - if ov_model_params.release_memory: - compiled_model.release_memory() - return outputs + if return_ov_tensors: + infer_request = compiled_model.create_infer_request() + infer_request.infer( + inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs + ) + outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))] + else: + outputs = compiled_model( + inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs + ) + outputs = [outputs[i] for i in range(len(outputs))] + outputs = [Tensor(it) for it in outputs] -def run_model_via_infer_request( - ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList -) -> TensorList: - # Returns results as ov tensors - if any(isinstance(it, Tensor) for it in inputs): - inputs = [inp.data for inp in inputs] - infer_request = compiled_model.create_infer_request() - infer_request.infer(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) - outputs = [Tensor(infer_request.get_output_tensor(i)) for i in range(len(infer_request.results))] if ov_model_params.release_memory: compiled_model.release_memory() + return outputs @@ -93,8 +92,6 @@ def get_compress_weight_model( ) -> ModelCallable: if scale_shape is None and zero_point_shape is not None: raise Exception("Zero point shape can only be provided if scale shape is provided.") - # if (scale_shape is None) != (reduction_axes is not None): - # raise Exception("Either one of scale_shape or reduction_axes must be provided at the same time.") if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) @@ -103,9 +100,6 @@ def get_compress_weight_model( if zero_point_shape is not None: zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) - if config.num_bits == 4: - ov_model_params.return_ov_tensors = True - return 
_build_compress_model( config, ov_model_params, @@ -150,28 +144,29 @@ def _build_compress_model( zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, -) -> ModelCallable: +) -> Union[ModelCallable, Tuple[List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]: weight = opset.parameter(weight_shape, name="w", dtype=OV_DTYPE_MAP[ov_model_params.input_dtype]) ov_parameters = [weight] - if scale_shape is not None: - # Compute only the compressed weight + mode = config.mode + asym_mode = mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] + num_bits = config.num_bits + eps = np.finfo(np.float32).eps + if asym_mode: + level_low = 0 + level_high = 2**num_bits - 1 + else: + level_low = -(2 ** (num_bits - 1)) + level_high = 2 ** (num_bits - 1) - 1 + min_values = None + if scale_shape is not None: + # Scale is given as an input scale = opset.parameter(scale_shape, name="s", dtype=ov.Type.f32) ov_parameters.append(scale) - - zero_point = None - if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: - zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) - ov_parameters.append(zero_point) - zero_point = opset.convert(zero_point, ov.Type.f32) else: - # Compute compressed weight, scale and, possibly, zero point - - mode = config.mode - num_bits = config.num_bits - eps = np.finfo(np.float32).eps - if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + # Compute scale + if asym_mode: min_values = opset.reduce_min( weight, reduction_axes=reduction_axes, keep_dims=True ) # [a1, r, a2] -> [a1, 1, a2] @@ -180,49 +175,64 @@ def _build_compress_model( ) # [a1, r, a2] -> [a1, 1, a2] min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) - level_low = 0 - level_high = 2**num_bits - 1 levels = level_high - level_low + 1 scale = (max_values - min_values) / opset.constant(levels - 1, ov.Type.f32) scale = opset.select(opset.abs(scale) < eps, eps, scale) - - zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) - zero_point = opset.clamp(zero_point, level_low, level_high) else: - zero_point = None - level_high = opset.constant(2 ** (num_bits - 1), ov.Type.f32) - w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True)) w_max = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True) w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32) scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.constant(0, ov.Type.f32) - w_max) - scale /= level_high + scale /= opset.constant(level_high, ov.Type.f32) scale = opset.select(opset.abs(scale) < eps, eps, scale) + zero_point = None + if zero_point_shape is not None: + # Zero point is given as an input + zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) + ov_parameters.append(zero_point) + zero_point = opset.convert(zero_point, ov.Type.f32) + elif mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + # Compute zero point + if min_values is None: + min_values = opset.reduce_min( + weight, reduction_axes=reduction_axes, keep_dims=True + ) # [a1, r, a2] -> [a1, 1, a2] + min_values = opset.convert(min_values, ov.Type.f32) + + level_low = 0 + level_high = 2**num_bits - 1 + zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) + zero_point = opset.clamp(zero_point, level_low, level_high) + if 
weight.get_element_type() != ov.Type.f32: weight = opset.convert(weight, ov.Type.f32) compressed_w = weight / scale - num_bits = config.num_bits - if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: - dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 - level_low = 0 - level_high = 2**num_bits - 1 + if asym_mode: + if ov_model_params.output_dtype is not None: + dtype = OV_DTYPE_MAP[ov_model_params.output_dtype] + else: + dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 compressed_w += zero_point - elif config.mode in [CompressWeightsMode.INT8_SYM, config.mode.INT4_SYM]: - dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.u4 - level_low = -(2 ** (num_bits - 1)) - level_high = 2 ** (num_bits - 1) - 1 else: - raise Exception + if ov_model_params.output_dtype is not None: + dtype = OV_DTYPE_MAP[ov_model_params.output_dtype] + else: + dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4 compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high) compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights") ov_results = [compressed_w] - if len(ov_parameters) == 1: - ov_results.append(scale) + if len(ov_parameters) != 3: + # Two cases: + # 1. weight -> compressed_weight, scale, (zero_point) + # 2. weight, scale -> compressed_weight, (zero_point) + if len(ov_parameters) == 1: + ov_results.append(scale) + if zero_point is not None: ov_results.append(opset.convert(zero_point, compressed_w.get_element_type())) @@ -232,8 +242,7 @@ def _build_compress_model( model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model - return partial(run_fn, ov_model_params, compiled_model) + return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) @cache_results(OV_MODEL_CACHE) @@ -249,25 +258,32 @@ def _build_compress_decompress_model( ) if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: - if len(ov_results) == 1: - compressed_w = ov_results[0] - s, zp = ov_parameters[1], ov_parameters[2] + if len(ov_parameters) == 1: + # weight -> compressed_weight, scale, zero_point + compressed_w, scale, zero_point = ov_results + elif len(ov_parameters) == 2: + # weight, scale -> compressed_weight, zero_point + compressed_w, zero_point = ov_results + scale = ov_parameters[1] else: - compressed_w, s, zp = ov_results - decompressed_w = (compressed_w - zp) * s - else: - if len(ov_results) == 1: + # weight, scale, zero_point -> compressed_weight compressed_w = ov_results[0] - s = ov_parameters[1] + scale, zero_point = ov_parameters[1:] + decompressed_w = opset.convert(opset.convert(compressed_w, ov.Type.i32) - zero_point, ov.Type.f32) * scale + else: + if len(ov_parameters) == 1: + # weight -> compressed_weight, scale + compressed_w, scale = ov_results else: - compressed_w, s = ov_results - decompressed_w = compressed_w * s + # weight, scale -> compressed_weight + compressed_w = ov_results[0] + scale = ov_parameters[1] + decompressed_w = opset.convert(compressed_w, ov.Type.f32) * scale model = ov.Model([decompressed_w], ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model - return partial(run_fn, ov_model_params, compiled_model) + return 
partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) def get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable: @@ -283,5 +299,4 @@ def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dt model = ov.Model([res], [arg]) compiled_model = ov.compile_model(model, device_name="CPU") - run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model - return partial(run_fn, ov_model_params, compiled_model) + return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 3af76eed391..8e0c4cd403d 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -481,14 +481,23 @@ def do_int_quantization( scale_shape = None if precomputed_scale is None else precomputed_scale.shape zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape + asym_mode = config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] if ov_model_params is None: - # ov_model_params = OVModelParameters(weight.dtype) + output_dtype = None + return_ov_tensors = False + if config.num_bits == 4: + if weight.backend == TensorBackend.ov: + return_ov_tensors = weight.backend == TensorBackend.ov + else: + output_dtype = TensorDataType.uint8 if asym_mode else TensorDataType.int8 ov_model_params = OVModelParameters( - weight.dtype, + input_dtype=weight.dtype, + output_dtype=output_dtype, dynamic_shapes=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))), recompile=bool(int(os.environ.get("RECOMPILE", "0"))), release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))), share_outputs=bool(int(os.environ.get("SHARE_OUTPUTS", "0"))), + return_ov_tensors=return_ov_tensors, ) model = get_compress_weight_model( @@ -501,14 +510,27 @@ def do_int_quantization( ) if precomputed_scale is None: - compressed_weight, scale, zero_point = model([weight]) + # weight -> compressed_weight, scale, (zero_point) + results = model([weight]) + if asym_mode: + compressed_weight, scale, zero_point = results + else: + compressed_weight, scale = results + zero_point = None + # Scale is always in fp32 so there is no need to store it in ov.Tensor if scale.backend == TensorBackend.ov: scale = scale.to_backend(TensorBackend.numpy) + elif precomputed_zero_point is None and asym_mode: + # weight, scale -> compressed_weight, zero_point + compressed_weight, zero_point = model([weight, precomputed_scale]) + scale = precomputed_scale else: - inputs = [weight, precomputed_scale] - if precomputed_zero_point is not None: - inputs += [precomputed_zero_point] + inputs = ( + [weight, precomputed_scale] + if precomputed_zero_point is None + else [weight, precomputed_scale, precomputed_zero_point] + ) compressed_weight = model(inputs)[0] scale, zero_point = precomputed_scale, precomputed_zero_point diff --git a/nncf/results_caching.py b/nncf/results_caching.py index 4a991a36be7..d1d16ea775b 100644 --- a/nncf/results_caching.py +++ b/nncf/results_caching.py @@ -1,3 +1,14 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import inspect From 283a821805e97d16ede321105a4ce97fb77ef7f9 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 28 Oct 2024 09:51:34 +0100 Subject: [PATCH 14/73] SE improvements --- .../algorithms/weight_compression/awq.py | 3 +- .../algorithms/weight_compression/config.py | 5 ++ .../weight_compression/mixed_precision.py | 2 +- .../weight_compression/openvino_modeling.py | 12 ++--- .../weight_compression/scale_estimation.py | 10 ++-- .../weight_compression/weight_lowering.py | 48 +++++++++++++------ .../quantization/test_weights_compression.py | 4 +- 7 files changed, 54 insertions(+), 30 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/awq.py b/nncf/quantization/algorithms/weight_compression/awq.py index 1b43f5339c4..4d78f8f8f4f 100644 --- a/nncf/quantization/algorithms/weight_compression/awq.py +++ b/nncf/quantization/algorithms/weight_compression/awq.py @@ -261,8 +261,9 @@ def apply( g_compressed_weighs = do_nf4_quantization(weights_to_fake_quantize, g_c_scale) g_decompressed_weighs = do_nf4_dequantization(g_compressed_weighs, g_c_scale) else: + # TODO: Improve by replacing with quantize_dequantize g_compressed_weighs, g_c_scale, g_c_zp = do_int_quantization( - weights_to_fake_quantize, reduction_axis, awq_config + weights_to_fake_quantize, awq_config, reduction_axis ) g_decompressed_weighs = do_int_dequantization(g_compressed_weighs, g_c_scale, g_c_zp) sacts = gacts / fns.unsqueeze(cur_scale, 1) diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index ce512331349..03590fc5ff3 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -40,6 +40,11 @@ def num_bits(self): """ return 8 if self.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM] else 4 + @property + def is_int_asym(self): + return self.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT8_ASYM] + + @property def is_integer(self): """ :return: True if compression type in integer, else False. 
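Usage note for the new is_int_asym property and the is_integer method-to-property change introduced by this commit (a minimal illustrative sketch, not part of the patch; the chosen mode and group_size values are arbitrary):

    from nncf import CompressWeightsMode
    from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig

    config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_ASYM, group_size=128)

    # Replaces the repeated check:
    #   config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]
    assert config.is_int_asym
    # is_integer is now a property rather than a method, so call sites change from
    # config.is_integer() to config.is_integer; it remains True for every mode except NF4/E2M1.
    assert config.is_integer
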
diff --git a/nncf/quantization/algorithms/weight_compression/mixed_precision.py b/nncf/quantization/algorithms/weight_compression/mixed_precision.py index 53d44c97748..247f8daf6cf 100644 --- a/nncf/quantization/algorithms/weight_compression/mixed_precision.py +++ b/nncf/quantization/algorithms/weight_compression/mixed_precision.py @@ -329,7 +329,7 @@ def _calc_weight_sensitivity( if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - compressed_weights, scale, zero_point = do_int_quantization(weight, reduction_axes, backup_config) + compressed_weights, scale, zero_point = do_int_quantization(weight, backup_config, reduction_axes) decompressed_weight = do_int_dequantization(compressed_weights, scale, zero_point) decompressed_weight = decompressed_weight.reshape(orig_shape) return fns.linalg.norm(decompressed_weight - weight, ord="fro").item() diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 2f223d71d06..06e1f2ddd70 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -148,11 +148,9 @@ def _build_compress_model( weight = opset.parameter(weight_shape, name="w", dtype=OV_DTYPE_MAP[ov_model_params.input_dtype]) ov_parameters = [weight] - mode = config.mode - asym_mode = mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] num_bits = config.num_bits eps = np.finfo(np.float32).eps - if asym_mode: + if config.is_int_asym: level_low = 0 level_high = 2**num_bits - 1 else: @@ -166,7 +164,7 @@ def _build_compress_model( ov_parameters.append(scale) else: # Compute scale - if asym_mode: + if config.is_int_asym: min_values = opset.reduce_min( weight, reduction_axes=reduction_axes, keep_dims=True ) # [a1, r, a2] -> [a1, 1, a2] @@ -193,7 +191,7 @@ def _build_compress_model( zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) ov_parameters.append(zero_point) zero_point = opset.convert(zero_point, ov.Type.f32) - elif mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + elif config.is_int_asym: # Compute zero point if min_values is None: min_values = opset.reduce_min( @@ -210,7 +208,7 @@ def _build_compress_model( weight = opset.convert(weight, ov.Type.f32) compressed_w = weight / scale - if asym_mode: + if config.is_int_asym: if ov_model_params.output_dtype is not None: dtype = OV_DTYPE_MAP[ov_model_params.output_dtype] else: @@ -257,7 +255,7 @@ def _build_compress_decompress_model( config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True ) - if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + if config.is_int_asym: if len(ov_parameters) == 1: # weight -> compressed_weight, scale, zero_point compressed_w, scale, zero_point = ov_results diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index e294c6e0f5d..68e161b5a8e 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -28,7 +28,6 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale from 
nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization @@ -221,7 +220,8 @@ def calculate_quantization_params( q_weights = do_nf4_dequantization(compressed_weights, scale, reduction_axis) zp = None else: - compressed_weights, scale, zp = do_int_quantization(original_weight, reduction_axis, cur_config) + # TODO: Improve by replacing with quantize_dequantize with additional outputs + compressed_weights, scale, zp = do_int_quantization(original_weight, cur_config, reduction_axis) if zp is not None: zp = zp.astype(scale.dtype) q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) @@ -297,7 +297,8 @@ def calculate_quantization_params( if config.mode == CompressWeightsMode.NF4: out = do_nf4_quantization(original_weight, near_to_ideal_scale) else: - out = calculate_quantized_weight(original_weight, config, near_to_ideal_scale, zp) + out, _, _ = do_int_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale, + precomputed_zero_point=zp) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) @@ -310,7 +311,8 @@ def calculate_quantization_params( if config.mode == CompressWeightsMode.NF4: out = do_nf4_quantization(original_weight, scaled_scale) else: - out = calculate_quantized_weight(original_weight, config, scaled_scale, zp) + out, _, _ = do_int_quantization(original_weight, config, precomputed_scale=scaled_scale, + precomputed_zero_point=zp) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 8e0c4cd403d..d9db96e7e77 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -270,14 +270,13 @@ def calculate_integer_quantization_params( :param config: Weight compression configuration. :return: Scale and zero point tensors. 
""" - mode = config.mode - assert config.is_integer(), "The function supports integer quantization only" + assert config.is_integer, "The function supports integer quantization only" num_bits = config.num_bits if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + if config.is_int_asym: level_low = 0 level_high = 2**num_bits - 1 min_values = fns.min(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] @@ -314,7 +313,7 @@ def calculate_quantized_weight( scale = scale.astype(TensorDataType.float32) num_bits = config.num_bits - asym_quant = config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] + asym_quant = config.is_int_asym dtype = TensorDataType.uint8 if asym_quant else TensorDataType.int8 level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 @@ -349,7 +348,7 @@ def get_integer_quantization_error( weight = weight.astype(TensorDataType.float32) compressed_weights, scale, zero_point = do_int_quantization( - weight, reduction_axes, config, invert_division=invert_division + weight, config, reduction_axes, invert_division=invert_division ) decompressed_weight = do_int_dequantization(compressed_weights, scale, zero_point) @@ -378,7 +377,7 @@ def compress_weight( :param precomputed_zero_point: Precomputed zero point. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ - if not config.is_integer(): + if not config.is_integer: if weight.backend == TensorBackend.ov: weight = weight.to_backend(TensorBackend.numpy) @@ -387,7 +386,7 @@ def compress_weight( ) return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_int_quantization( - weight, reduction_axes, config, precomputed_scale, precomputed_zero_point, invert_division=invert_division + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, invert_division=invert_division ) return CompressedWeight(compressed_weight, scale, zero_point) @@ -436,14 +435,28 @@ def do_int_dequantization( def do_int_quantization( weight: Tensor, - reduction_axes: ReductionAxes, config: WeightCompressionConfig, + reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, invert_division: Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None, ): - assert config.is_integer(), "The function supports integer quantization only" + """ + Performs integer quantization on the given weight tensor. + + :param weight: The weight tensor to quantize. + :param config: The weight compression configuration. + :param reduction_axes: Axes along which to reduce (collect) statistics (e.g., min, max). Not required if + precomputed scale (and zero point) are provided. + :param precomputed_scale: Optional precomputed scale tensor. + :param precomputed_zero_point: Optional precomputed zero point tensor. + :param invert_division: Whether to apply inversion for scale and then multiply by weights instead of division. + Defaults to False. + :param ov_model_params: OpenVINO model parameters for acceleration. + :return: A tuple containing the compressed weights, scale, and zero point tensors. 
+ """ + assert config.is_integer, "The function supports integer quantization only" accelerate_through_ov = ( is_openvino_available() @@ -453,7 +466,8 @@ def do_int_quantization( if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") - if config.group_size != -1: + # When reduction axes are not provided, assuming that the weights are already reshaped + if config.group_size != -1 and reduction_axes is not None: # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) @@ -467,7 +481,7 @@ def do_int_quantization( weight = weight.astype(TensorDataType.float32) scale, zero_point = None, None - if precomputed_zero_point is None or precomputed_zero_point is None: + if precomputed_scale is None or (config.is_int_asym and precomputed_zero_point is None): scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config) if precomputed_scale is not None: scale = precomputed_scale @@ -481,7 +495,6 @@ def do_int_quantization( scale_shape = None if precomputed_scale is None else precomputed_scale.shape zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape - asym_mode = config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] if ov_model_params is None: output_dtype = None return_ov_tensors = False @@ -489,7 +502,12 @@ def do_int_quantization( if weight.backend == TensorBackend.ov: return_ov_tensors = weight.backend == TensorBackend.ov else: - output_dtype = TensorDataType.uint8 if asym_mode else TensorDataType.int8 + output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8 + # ov_model_params = OVModelParameters( + # input_dtype=weight.dtype, + # output_dtype=output_dtype, + # return_ov_tensors=return_ov_tensors, + # ) ov_model_params = OVModelParameters( input_dtype=weight.dtype, output_dtype=output_dtype, @@ -512,7 +530,7 @@ def do_int_quantization( if precomputed_scale is None: # weight -> compressed_weight, scale, (zero_point) results = model([weight]) - if asym_mode: + if config.is_int_asym: compressed_weight, scale, zero_point = results else: compressed_weight, scale = results @@ -521,7 +539,7 @@ def do_int_quantization( # Scale is always in fp32 so there is no need to store it in ov.Tensor if scale.backend == TensorBackend.ov: scale = scale.to_backend(TensorBackend.numpy) - elif precomputed_zero_point is None and asym_mode: + elif precomputed_zero_point is None and config.is_int_asym: # weight, scale -> compressed_weight, zero_point compressed_weight, zero_point = model([weight, precomputed_scale]) scale = precomputed_scale diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 5d89c75e542..ccf539aeb86 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1031,7 +1031,7 @@ def test_np_ov_compression_decompression(mode): config = WeightCompressionConfig(mode) - compressed_weighs, scale, zp = do_int_quantization(w, -1, config, invert_scale=True) + compressed_weighs, scale, zp = do_int_quantization(w, config, -1, invert_division=True) decompressed_weighs = do_int_dequantization(compressed_weighs, scale, zp) compressed_weighs = compressed_weighs.data @@ -1067,7 +1067,7 @@ def 
test_compressed_weighs_range(mode, data): w = Tensor(data) config = WeightCompressionConfig(mode=mode) - compressed_weighs, _, _ = do_int_quantization(w, -1, config) + compressed_weighs, _, _ = do_int_quantization(w, config, -1) assert np.allclose(np.abs(compressed_weighs.data), np.abs(w.data)) From 69648449a7cc0d4f642b14ab8e12f8a69912e08d Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 28 Oct 2024 13:34:23 +0100 Subject: [PATCH 15/73] Accelerate AWQ --- .../algorithms/weight_compression/awq.py | 10 +++--- .../weight_compression/openvino_modeling.py | 14 +++++--- .../weight_compression/scale_estimation.py | 10 +++--- .../weight_compression/weight_lowering.py | 32 ++++++++++++------- 4 files changed, 41 insertions(+), 25 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/awq.py b/nncf/quantization/algorithms/weight_compression/awq.py index 4d78f8f8f4f..ea59258b940 100644 --- a/nncf/quantization/algorithms/weight_compression/awq.py +++ b/nncf/quantization/algorithms/weight_compression/awq.py @@ -31,11 +31,11 @@ from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_scale -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization from nncf.quantization.passes import transform_to_inference_graph +from nncf.tensor import TensorDataType from nncf.tensor import functions as fns TModel = TypeVar("TModel") @@ -241,7 +241,7 @@ def apply( offset = gi * group_size gscale = s[offset : offset + group_size] - a_min = fns.quantile(gscale, 0.1) + a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32) a_max = 1e2 gscale = fns.clip(gscale, a_min=a_min, a_max=a_max) @@ -261,11 +261,9 @@ def apply( g_compressed_weighs = do_nf4_quantization(weights_to_fake_quantize, g_c_scale) g_decompressed_weighs = do_nf4_dequantization(g_compressed_weighs, g_c_scale) else: - # TODO: Improve by replacing with quantize_dequantize - g_compressed_weighs, g_c_scale, g_c_zp = do_int_quantization( + g_decompressed_weighs = calculate_quantized_dequantized_weight( weights_to_fake_quantize, awq_config, reduction_axis ) - g_decompressed_weighs = do_int_dequantization(g_compressed_weighs, g_c_scale, g_c_zp) sacts = gacts / fns.unsqueeze(cur_scale, 1) cur_out = fns.matmul(g_decompressed_weighs, sacts) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 06e1f2ddd70..c26c095a70c 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -116,8 +116,9 @@ def get_compress_decompress_weight_model( ov_model_params: OVModelParameters, config: WeightCompressionConfig, weight_shape: Tuple, - scale_shape: Optional[Tuple], + scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, + reduction_axes: Optional[Tuple] = 
None, ) -> ModelCallable: if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) @@ -131,6 +132,7 @@ def get_compress_decompress_weight_model( weight_shape, scale_shape, zero_point_shape, + reduction_axes, disable_caching=ov_model_params.recompile, ) @@ -248,11 +250,12 @@ def _build_compress_decompress_model( config: WeightCompressionConfig, ov_model_params: OVModelParameters, weight_shape: Tuple, - scale_shape: Tuple, + scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, + reduction_axes: Optional[Tuple] = None, ) -> ModelCallable: ov_parameters, ov_results = _build_compress_model( - config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True + config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True ) if config.is_int_asym: @@ -267,7 +270,10 @@ def _build_compress_decompress_model( # weight, scale, zero_point -> compressed_weight compressed_w = ov_results[0] scale, zero_point = ov_parameters[1:] - decompressed_w = opset.convert(opset.convert(compressed_w, ov.Type.i32) - zero_point, ov.Type.f32) * scale + + decompressed_w = scale * opset.convert( + opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32), ov.Type.f32 + ) else: if len(ov_parameters) == 1: # weight -> compressed_weight, scale diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 68e161b5a8e..bb814724df3 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -297,8 +297,9 @@ def calculate_quantization_params( if config.mode == CompressWeightsMode.NF4: out = do_nf4_quantization(original_weight, near_to_ideal_scale) else: - out, _, _ = do_int_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale, - precomputed_zero_point=zp) + out, _, _ = do_int_quantization( + original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp + ) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) @@ -311,8 +312,9 @@ def calculate_quantization_params( if config.mode == CompressWeightsMode.NF4: out = do_nf4_quantization(original_weight, scaled_scale) else: - out, _, _ = do_int_quantization(original_weight, config, precomputed_scale=scaled_scale, - precomputed_zero_point=zp) + out, _, _ = do_int_quantization( + original_weight, config, precomputed_scale=scaled_scale, precomputed_zero_point=zp + ) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index d9db96e7e77..2f70e7e47a9 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -558,8 +558,9 @@ def do_int_quantization( def calculate_quantized_dequantized_weight( weight: Tensor, config: WeightCompressionConfig, - scale: Tensor, - zero_point: Optional[Tensor] = None, + reduction_axes: Optional[ReductionAxes] = None, + precomputed_scale: Optional[Tensor] = None, + precomputed_zero_point: Optional[Tensor] = None, invert_division: 
Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None, ) -> Tensor: @@ -573,23 +574,32 @@ def calculate_quantized_dequantized_weight( if not accelerate_through_ov: # Reference implementation - compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) + if precomputed_scale is None or (config.is_int_asym and precomputed_zero_point is None): + compressed_weight, scale, zero_point = do_int_quantization( + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, invert_division + ) + else: + scale = precomputed_scale if precomputed_scale is not None else None + zero_point = precomputed_zero_point if precomputed_zero_point is not None else None + compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) return decompressed_weight weight_shape = weight.shape - scale_shape = scale.shape - zero_point_shape = None if zero_point is None else zero_point.shape + scale_shape = precomputed_scale.shape if precomputed_scale is not None else None + zero_point_shape = precomputed_zero_point.shape if precomputed_zero_point is not None else None if ov_model_params is None: ov_model_params = OVModelParameters(weight.dtype) - if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: - ov_model_params.dynamic = False - model = get_compress_decompress_weight_model(ov_model_params, config, weight_shape, scale_shape, zero_point_shape) + model = get_compress_decompress_weight_model( + ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes + ) - inputs = [weight, scale] - if zero_point is not None: - inputs.append(zero_point) + inputs = [weight] + if precomputed_scale is not None: + inputs.append(precomputed_scale) + if precomputed_zero_point is not None: + inputs.append(precomputed_zero_point) decompressed_weight = model(inputs)[0] return decompressed_weight From 80e2c928171e0fa5340c1df5f5cba02ef14eb4a6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 29 Oct 2024 10:47:20 +0100 Subject: [PATCH 16/73] SE changes --- .../weight_compression/openvino_modeling.py | 7 +- .../weight_compression/scale_estimation.py | 16 +++-- .../weight_compression/weight_lowering.py | 67 +++++++++++-------- 3 files changed, 55 insertions(+), 35 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index c26c095a70c..2840d32e8b2 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -47,6 +47,7 @@ def __hash__(self): return hash( ( self.input_dtype, + self.output_dtype, self.dynamic_shapes, self.recompile, self.release_memory, @@ -119,6 +120,7 @@ def get_compress_decompress_weight_model( scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, + return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) @@ -133,6 +135,7 @@ def get_compress_decompress_weight_model( scale_shape, zero_point_shape, reduction_axes, + return_compressed_weight, disable_caching=ov_model_params.recompile, ) @@ -253,6 +256,7 @@ def _build_compress_decompress_model( scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, 
reduction_axes: Optional[Tuple] = None, + return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: ov_parameters, ov_results = _build_compress_model( config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True @@ -284,7 +288,8 @@ def _build_compress_decompress_model( scale = ov_parameters[1] decompressed_w = opset.convert(compressed_w, ov.Type.f32) * scale - model = ov.Model([decompressed_w], ov_parameters) + ov_results = [decompressed_w] + ov_results if return_compressed_weight else [decompressed_w] + model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index bb814724df3..b35188d05ae 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -28,7 +28,6 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization @@ -220,11 +219,11 @@ def calculate_quantization_params( q_weights = do_nf4_dequantization(compressed_weights, scale, reduction_axis) zp = None else: - # TODO: Improve by replacing with quantize_dequantize with additional outputs - compressed_weights, scale, zp = do_int_quantization(original_weight, cur_config, reduction_axis) + q_weights, compressed_weights, scale, zp = calculate_quantized_dequantized_weight( + original_weight, cur_config, reduction_axis, return_compressed_weight=True + ) if zp is not None: zp = zp.astype(scale.dtype) - q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) s = fns.unsqueeze(s, 0) s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) @@ -243,7 +242,6 @@ def calculate_quantization_params( importance = importance / (denum + eps) X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) - q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) best_diffs = None result_scale = None @@ -269,7 +267,9 @@ def calculate_quantization_params( g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: - out = calculate_quantized_dequantized_weight(original_weight, config, near_to_ideal_scale, zp) + out = calculate_quantized_dequantized_weight( + original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp + ) q_weights_ = fns.zeros_like(original_weight) + out q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) @@ -326,7 +326,9 @@ def calculate_quantization_params( g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) 
out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: - out = calculate_quantized_dequantized_weight(original_weight, config, near_to_ideal_scale, zp) + out = calculate_quantized_dequantized_weight( + original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp + ) q_weights_ = fns.zeros_like(original_weight) + out q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 2f70e7e47a9..265b624c872 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -11,7 +11,7 @@ import logging import os from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import numpy as np @@ -496,27 +496,16 @@ def do_int_quantization( zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape if ov_model_params is None: - output_dtype = None - return_ov_tensors = False - if config.num_bits == 4: - if weight.backend == TensorBackend.ov: - return_ov_tensors = weight.backend == TensorBackend.ov - else: - output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8 - # ov_model_params = OVModelParameters( - # input_dtype=weight.dtype, - # output_dtype=output_dtype, - # return_ov_tensors=return_ov_tensors, - # ) - ov_model_params = OVModelParameters( - input_dtype=weight.dtype, - output_dtype=output_dtype, - dynamic_shapes=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))), - recompile=bool(int(os.environ.get("RECOMPILE", "0"))), - release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))), - share_outputs=bool(int(os.environ.get("SHARE_OUTPUTS", "0"))), - return_ov_tensors=return_ov_tensors, - ) + ov_model_params = OVModelParameters(weight.dtype) + if config.num_bits == 4: + if weight.backend == TensorBackend.ov: + ov_model_params.return_ov_tensors = weight.backend == TensorBackend.ov + else: + ov_model_params.output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8 + ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) + ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) + ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) + ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0"))) model = get_compress_weight_model( ov_model_params, @@ -562,8 +551,9 @@ def calculate_quantized_dequantized_weight( precomputed_scale: Optional[Tensor] = None, precomputed_zero_point: Optional[Tensor] = None, invert_division: Optional[bool] = False, + return_compressed_weight: Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None, -) -> Tensor: +) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: accelerate_through_ov = ( is_openvino_available() and weight.backend != TensorBackend.torch @@ -583,7 +573,15 @@ def calculate_quantized_dequantized_weight( zero_point = precomputed_zero_point if precomputed_zero_point is not None else None compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) - return decompressed_weight + if return_compressed_weight: + return decompressed_weight, compressed_weight, scale, zero_point + else: + return 
decompressed_weight + + # When reduction axes are not provided, assuming that the weights are already reshaped + if config.group_size != -1 and reduction_axes is not None: + # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] + weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) weight_shape = weight.shape scale_shape = precomputed_scale.shape if precomputed_scale is not None else None @@ -591,9 +589,11 @@ def calculate_quantized_dequantized_weight( if ov_model_params is None: ov_model_params = OVModelParameters(weight.dtype) + if return_compressed_weight and config.num_bits == 4: + ov_model_params.output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8 model = get_compress_decompress_weight_model( - ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes + ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_compressed_weight ) inputs = [weight] @@ -601,5 +601,18 @@ def calculate_quantized_dequantized_weight( inputs.append(precomputed_scale) if precomputed_zero_point is not None: inputs.append(precomputed_zero_point) - decompressed_weight = model(inputs)[0] - return decompressed_weight + + compressed_weight, scale, zero_point = None, None, None + results = model(inputs) + if len(results) == 1: + decompressed_weight = results[0] + elif len(results) == 2: + decompressed_weight, compressed_weight = results + elif len(results) == 3: + decompressed_weight, compressed_weight, scale = results + else: + decompressed_weight, compressed_weight, scale, zero_point = results + if return_compressed_weight: + return decompressed_weight, compressed_weight, scale, zero_point + else: + return decompressed_weight From fc828664d0718eea5ae23ab3b85d5ad2cbddd9f8 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 29 Oct 2024 15:52:57 +0100 Subject: [PATCH 17/73] Add access counts to caching decorator --- compare_inference_time.py | 125 ++++++++++++++++++++++++++++++++++++++ nncf/results_caching.py | 5 ++ 2 files changed, 130 insertions(+) create mode 100644 compare_inference_time.py diff --git a/compare_inference_time.py b/compare_inference_time.py new file mode 100644 index 00000000000..f11884dbd0f --- /dev/null +++ b/compare_inference_time.py @@ -0,0 +1,125 @@ +import gc +import time + +import numpy as np +from unittest.mock import patch +from tqdm import tqdm + +from nncf import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters, OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization, calculate_quantized_dequantized_weight +from nncf.tensor import Tensor +import nncf.utils + + +def get_random_weights(size, amount, n_unique_shapes, dtype, is_sorted=True): + n_channels = set() + while len(n_channels) < n_unique_shapes: + n_channels.add(int(np.random.normal(np.sqrt(size), n_unique_shapes))) + n_channels = list(n_channels) + + unique_weights = [] + for d in n_channels: + shape = (size // d, d) + unique_weights.append(Tensor(np.random.random(shape).astype(dtype))) + + result = [] + for _ in range(amount): + result.append(np.random.choice(unique_weights)) + + if is_sorted: + result = sorted(result, key=lambda x: x.shape[0] * x.shape[1], reverse=True) + return result + + +def measure_compression_time(weights, config, is_ov, verbose=True): 
+    orig_value = nncf.utils._openvino_available
+    nncf.utils._openvino_available = is_ov
+
+    start_time = time.perf_counter()
+    for w in tqdm(weights, disable=not verbose):
+        do_int_quantization(
+        # calculate_quantized_dequantized_weight(
+            w,
+            config,
+            reduction_axes=(1,),
+            ov_model_params=OVModelParameters(
+                input_dtype=w.dtype,
+                output_dtype=None,
+                dynamic_shapes=bool(0),
+                recompile=bool(0),
+                release_memory=bool(1),
+                share_inputs=bool(1),
+                share_outputs=bool(1),
+                return_ov_tensors=bool(0),
+            ),
+            # return_compressed_weight=bool(1)
+        )
+    end_time = time.perf_counter()
+    total_time = end_time - start_time
+    avg_time = total_time / len(weights)
+    if verbose:
+        print("OV" if is_ov else "NP", f"avg. time: {avg_time:.1e} sec.")
+
+    nncf.utils._openvino_available = orig_value
+    OV_MODEL_CACHE.clear()
+    gc.collect()
+    return avg_time
+
+
+def bin_search(l, r, config, n, dtype):
+    while r / l > 1.05:
+        m = np.sqrt(l * r)
+        weights = get_random_weights(
+            size=int(m),
+            amount=n,
+            # n_unique_shapes=int(np.sqrt(n)),
+            n_unique_shapes=1,
+            dtype=dtype
+        )
+        t_np = measure_compression_time(
+            weights,
+            config,
+            is_ov=False,
+            verbose=False,
+        )
+        t_ov = measure_compression_time(
+            weights,
+            config,
+            is_ov=True,
+            verbose=False,
+        )
+        print(f"S: {m:.1e}. NP time: {t_np:.1e} sec. OV time: {t_ov:.1e} sec.")
+        if t_np < t_ov:
+            l = m
+        else:
+            r = m
+
+
+N = int(1e5)
+S = int(5e5) # 5e5 for compression/decompression,
+K = int(np.sqrt(N))
+DTYPE = np.float32
+
+bin_search(
+    l=int(1e2),
+    r=int(1e5),
+    config=WeightCompressionConfig(
+        CompressWeightsMode.INT4_ASYM,
+        group_size=-1
+    ),
+    n=N,
+    dtype=DTYPE,
+)
+
+# weights = get_random_weights(size=S, amount=N, n_unique_shapes=K, dtype=np.float32)
+# for is_ov in [False, True]:
+#     measure_compression_time(
+#         weights,
+#         WeightCompressionConfig(
+#             CompressWeightsMode.INT4_ASYM,
+#             group_size=-1
+#         ),
+#         is_ov=is_ov,
+#     )
diff --git a/nncf/results_caching.py b/nncf/results_caching.py
index d1d16ea775b..5d8b7fa99c9 100644
--- a/nncf/results_caching.py
+++ b/nncf/results_caching.py
@@ -10,22 +10,27 @@
 # limitations under the License.
 
 import inspect
+from collections import defaultdict
 
 
 class ResultsCacheContainer:
     def __init__(self):
         self._cache = {}
+        self._access_count = {}
 
     def clear(self):
         self._cache.clear()
+        self._access_count.clear()
 
     def is_empty(self):
         return len(self._cache) == 0
 
     def __getitem__(self, item):
+        self._access_count[item] += 1
         return self._cache[item]
 
     def __setitem__(self, key, value):
+        self._access_count[key] = 0
         self._cache[key] = value
 
     def __contains__(self, item):

From f3891cda50a3bf75cd57dcfaeae4eb13e4231bf1 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 29 Oct 2024 16:18:32 +0100
Subject: [PATCH 18/73] Comment out env vars

---
 .../algorithms/weight_compression/weight_lowering.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
index 265b624c872..3eaa024f4c2 100644
--- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py
+++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -461,7 +461,7 @@ def do_int_quantization(
     accelerate_through_ov = (
         is_openvino_available()
         and weight.backend != TensorBackend.torch
-        and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
+        # and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
     )
     if not is_openvino_available() and weight.backend != TensorBackend.torch:
         log_once(logging.INFO, "Compression time may be improved after installing OpenVINO")
@@ -502,10 +502,10 @@ def do_int_quantization(
         ov_model_params.return_ov_tensors = weight.backend == TensorBackend.ov
     else:
         ov_model_params.output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8
-        ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0")))
-        ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0")))
-        ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0")))
-        ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0")))
+        # ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0")))
+        # ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0")))
+        # ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0")))
+        # ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0")))
 
     model = get_compress_weight_model(
         ov_model_params,
@@ -557,7 +557,7 @@ def calculate_quantized_dequantized_weight(
     accelerate_through_ov = (
         is_openvino_available()
         and weight.backend != TensorBackend.torch
-        and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
+        # and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
     )
     if not is_openvino_available() and weight.backend != TensorBackend.torch:
         log_once(logging.INFO, "Compression time may be improved after installing OpenVINO")

From 353aac14524eb64aba7dd20661fe3333c248fefe Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 29 Oct 2024 16:43:30 +0100
Subject: [PATCH 19/73] Fix existing tests

---
 .github/workflows/precommit.yml             |  2 ++
 compare_inference_time.py                   | 21 +++++++------
 nncf/openvino/graph/node_utils.py           |  2 +-
 .../weight_compression/openvino_backend.py  |  7 +++--
 .../weight_compression/openvino_modeling.py |  6 ++--
 .../weight_compression/scale_estimation.py  |  1 +
 .../weight_compression/weight_lowering.py   | 18 ++++++-----
 nncf/results_caching.py                     |  1 -
 .../template_test_nncf_tensor.py            | 20 +++++++++++--
 tests/openvino/native/models.py             |  4 +--
.../quantization/test_weights_compression.py | 30 ------------------- 11 files changed, 51 insertions(+), 61 deletions(-) diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index cbed985e3de..1772e2619ad 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -64,6 +64,8 @@ jobs: cache: pip - name: Install NNCF and test requirements run: make install-openvino-test + - name: Install OpenVINO Nightly + run: pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Print installed modules run: pip list - name: Run OV precommit test scope diff --git a/compare_inference_time.py b/compare_inference_time.py index f11884dbd0f..452e3e57afd 100644 --- a/compare_inference_time.py +++ b/compare_inference_time.py @@ -1,16 +1,18 @@ import gc import time +from unittest.mock import patch import numpy as np -from unittest.mock import patch from tqdm import tqdm +import nncf.utils from nncf import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters, OV_MODEL_CACHE -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization, calculate_quantized_dequantized_weight +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.tensor import Tensor -import nncf.utils def get_random_weights(size, amount, n_unique_shapes, dtype, is_sorted=True): @@ -40,7 +42,7 @@ def measure_compression_time(weights, config, is_ov, verbose=True): start_time = time.perf_counter() for w in tqdm(weights, disable=not verbose): do_int_quantization( - # calculate_quantized_dequantized_weight( + # calculate_quantized_dequantized_weight( w, config, reduction_axes=(1,), @@ -76,7 +78,7 @@ def bin_search(l, r, config, n, dtype): amount=n, # n_unique_shapes=int(np.sqrt(n)), n_unique_shapes=1, - dtype=dtype + dtype=dtype, ) t_np = measure_compression_time( weights, @@ -98,17 +100,14 @@ def bin_search(l, r, config, n, dtype): N = int(1e5) -S = int(5e5) # 5e5 for compression/decompression, +S = int(5e5) # 5e5 for compression/decompression, K = int(np.sqrt(N)) DTYPE = np.float32 bin_search( l=int(1e2), r=int(1e5), - config=WeightCompressionConfig( - CompressWeightsMode.INT4_ASYM, - group_size=-1 - ), + config=WeightCompressionConfig(CompressWeightsMode.INT4_ASYM, group_size=-1), n=N, dtype=DTYPE, ) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 33d67140d16..05e759f1b16 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -107,7 +107,7 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int: return cnt_if_op(model, 0) -def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = False) -> np.ndarray: +def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = True) -> np.ndarray: """ Returns the constant tensor for the node. This method is applicable only for the floating-point constant data. 
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 8fbd0e2935a..3e14be11561 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -11,8 +11,6 @@ from typing import Dict, Iterable, List, Optional, Tuple import openvino as ov -from openvino import Type -from openvino.properties.hint import inference_precision from openvino.runtime import opset13 as opset from openvino.runtime.op import Constant @@ -336,7 +334,10 @@ def transform_model( weight = weight.to_backend(TensorBackend.numpy) if compressed_weight.tensor.backend == TensorBackend.ov: compressed_weight.tensor = compressed_weight.tensor.to_backend(TensorBackend.numpy) - if compressed_weight.zero_point.backend == TensorBackend.ov: + if ( + compressed_weight.zero_point is not None + and compressed_weight.zero_point.backend == TensorBackend.ov + ): compressed_weight.zero_point = compressed_weight.zero_point.to_backend(TensorBackend.numpy) adapters = lora_correction_algo.calculate_adapters(weight, compressed_weight, wc_params) self.insert_adapters(wc_params, *adapters, int8_lora=lora_correction_algo.use_int8_adapters) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 2840d32e8b2..a1c99241b4c 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -186,8 +186,8 @@ def _build_compress_model( w_max = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True) w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32) - scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.constant(0, ov.Type.f32) - w_max) - scale /= opset.constant(level_high, ov.Type.f32) + scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.negative(w_max)) + scale /= opset.constant(-level_low, ov.Type.f32) scale = opset.select(opset.abs(scale) < eps, eps, scale) zero_point = None @@ -204,8 +204,6 @@ def _build_compress_model( ) # [a1, r, a2] -> [a1, 1, a2] min_values = opset.convert(min_values, ov.Type.f32) - level_low = 0 - level_high = 2**num_bits - 1 zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) zero_point = opset.clamp(zero_point, level_low, level_high) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index b35188d05ae..9e5fbb3d678 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -217,6 +217,7 @@ def calculate_quantization_params( ) compressed_weights = do_nf4_quantization(norm_weight, scale, is_normalized_weight=True) q_weights = do_nf4_dequantization(compressed_weights, scale, reduction_axis) + q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) zp = None else: q_weights, compressed_weights, scale, zp = calculate_quantized_dequantized_weight( diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 3eaa024f4c2..14df740d2c9 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ 
b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -9,7 +9,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import os from dataclasses import dataclass from typing import Optional, Tuple, Union @@ -19,9 +18,6 @@ from nncf.common.logging.logger import log_once from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters -from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model -from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor from nncf.tensor import functions as fns @@ -440,7 +436,7 @@ def do_int_quantization( precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, invert_division: Optional[bool] = False, - ov_model_params: Optional[OVModelParameters] = None, + ov_model_params: Optional["OVModelParameters"] = None, ): """ Performs integer quantization on the given weight tensor. @@ -458,6 +454,7 @@ def do_int_quantization( """ assert config.is_integer, "The function supports integer quantization only" + # import os accelerate_through_ov = ( is_openvino_available() and weight.backend != TensorBackend.torch @@ -471,7 +468,7 @@ def do_int_quantization( # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) - if not accelerate_through_ov: + if not accelerate_through_ov or True: # Reference implementation if weight.backend == TensorBackend.ov: @@ -491,6 +488,9 @@ def do_int_quantization( compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) return compressed_weights, scale, zero_point + from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters + from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model + weight_shape = weight.shape scale_shape = None if precomputed_scale is None else precomputed_scale.shape zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape @@ -552,8 +552,9 @@ def calculate_quantized_dequantized_weight( precomputed_zero_point: Optional[Tensor] = None, invert_division: Optional[bool] = False, return_compressed_weight: Optional[bool] = False, - ov_model_params: Optional[OVModelParameters] = None, + ov_model_params: Optional["OVModelParameters"] = None, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: + # import os accelerate_through_ov = ( is_openvino_available() and weight.backend != TensorBackend.torch @@ -578,6 +579,9 @@ def calculate_quantized_dequantized_weight( else: return decompressed_weight + from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters + from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model + # When reduction axes are not provided, assuming that the weights are already reshaped if config.group_size != -1 and reduction_axes is not None: # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] diff --git a/nncf/results_caching.py b/nncf/results_caching.py index 
5d8b7fa99c9..9b314863108 100644 --- a/nncf/results_caching.py +++ b/nncf/results_caching.py @@ -10,7 +10,6 @@ # limitations under the License. import inspect -from collections import defaultdict class ResultsCacheContainer: diff --git a/tests/cross_fw/test_templates/template_test_nncf_tensor.py b/tests/cross_fw/test_templates/template_test_nncf_tensor.py index 13f2d6bc976..97b59342d58 100644 --- a/tests/cross_fw/test_templates/template_test_nncf_tensor.py +++ b/tests/cross_fw/test_templates/template_test_nncf_tensor.py @@ -1504,7 +1504,15 @@ def test_expand_dims_error(self, x, axis, match): def test_fn_zeros(self): shape = (2, 2) for dtype in TensorDataType: - if dtype == TensorDataType.bfloat16 and self.backend() == TensorBackend.numpy: + if ( + self.backend() == TensorBackend.numpy + and dtype == TensorDataType.bfloat16 + or dtype + in [ + TensorDataType.int4, + TensorDataType.uint4, + ] + ): continue tensor_a = fns.zeros(shape, backend=self.backend(), dtype=dtype, device=self.device()) assert isinstance(tensor_a, Tensor) @@ -1525,7 +1533,15 @@ def test_fn_zeros(self): ) def test_fn_eye(self, n, m, ref): for dtype in TensorDataType: - if dtype == TensorDataType.bfloat16 and self.backend() == TensorBackend.numpy: + if ( + self.backend() == TensorBackend.numpy + and dtype == TensorDataType.bfloat16 + or dtype + in [ + TensorDataType.int4, + TensorDataType.uint4, + ] + ): continue tensor_a = fns.eye(n, m, backend=self.backend(), dtype=dtype, device=self.device()) assert isinstance(tensor_a, Tensor) diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index 5f779bd96e9..48d0807e07f 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -290,11 +290,11 @@ def __init__(self, const_dtype: ov.Type = ov.Type.f32, input_dtype: ov.Type = ov def _create_ov_model(self): input_shape = [1, 3, 4, 2] input_1 = opset.parameter(input_shape, name="Input", dtype=self.input_dtype) - data = opset.constant(value=self._rng.random((1, 3, 4, 5)), dtype=self.const_dtype, name="MatMul_const") + data = opset.constant(self._rng.random((1, 3, 4, 5)), dtype=self.const_dtype, name="MatMul_const") if self.const_dtype != self.input_dtype: data = opset.convert(data, self.input_dtype.to_string()) matmul = opset.matmul(input_1, data, transpose_a=True, transpose_b=False, name="MatMul") - bias = opset.constant(value=self._rng.random((1, 3, 1, 1)), dtype=self.const_dtype, name="MatMul_bias") + bias = opset.constant(self._rng.random((1, 3, 1, 1)), dtype=self.const_dtype, name="MatMul_bias") if self.const_dtype != self.input_dtype: bias = opset.convert(bias, self.input_dtype.to_string()) add = opset.add(matmul, bias, name="Add") diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index ccf539aeb86..a9623a2ccf4 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -27,7 +27,6 @@ from nncf.data.dataset import Dataset from nncf.experimental.common.tensor_statistics.collectors import AggregatorBase from nncf.openvino.graph.node_utils import get_const_value -from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE from nncf.parameters import BackupMode from nncf.quantization import compress_weights from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams @@ -36,7 +35,6 @@ from 
nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization @@ -1023,34 +1021,6 @@ def test_mixed_precision_e2m1(mode, all_layers, ratio, ref_ids): assert ref_e8m0_nodes == names_e8m0 -@pytest.mark.parametrize("mode", (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM)) -def test_np_ov_compression_decompression(mode): - sz = 60 - w = np.arange(-sz, sz).reshape(2, sz).astype(np.float32) / 9.0 - w = Tensor(w) - - config = WeightCompressionConfig(mode) - - compressed_weighs, scale, zp = do_int_quantization(w, config, -1, invert_division=True) - decompressed_weighs = do_int_dequantization(compressed_weighs, scale, zp) - - compressed_weighs = compressed_weighs.data - decompressed_weighs = decompressed_weighs.data - zp_shape = zp.shape if zp is not None else None - - compress = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive(config, w.shape, scale.shape, zp_shape) - compress_decompress = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_decompress_weight_primitive( - config, w.shape, scale.shape, zp_shape - ) - - params = [w.data, scale.data, zp.data] if zp is not None else [w.data, scale.data] - compressed_weighs_ov = compress(params) - decompressed_weighs_ov = compress_decompress(params) - - assert np.allclose(compressed_weighs, compressed_weighs_ov) - assert np.allclose(decompressed_weighs, decompressed_weighs_ov) - - @pytest.mark.parametrize( ("mode", "data"), ( From d20e593cbd10e3904b05e0d14778b9ff3b7dd9c1 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 30 Oct 2024 09:42:19 +0100 Subject: [PATCH 20/73] Unstage helper scripts --- .github/workflows/precommit.yml | 2 +- compare_inference_time.py | 124 ------ .../weight_compression/openvino_backend.py | 1 - .../weight_compression/weight_lowering.py | 6 +- run_weight_compression.py | 373 ------------------ weight_compression.py | 210 ---------- 6 files changed, 4 insertions(+), 712 deletions(-) delete mode 100644 compare_inference_time.py delete mode 100644 run_weight_compression.py delete mode 100644 weight_compression.py diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index 1772e2619ad..218d9c32fd1 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -65,7 +65,7 @@ jobs: - name: Install NNCF and test requirements run: make install-openvino-test - name: Install OpenVINO Nightly - run: pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + run: pip install -U --pre openvino==2024.5.0.dev20241015 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Print installed modules run: pip list - name: Run OV precommit test scope diff --git a/compare_inference_time.py b/compare_inference_time.py deleted file mode 100644 index 452e3e57afd..00000000000 --- a/compare_inference_time.py +++ /dev/null @@ -1,124 +0,0 @@ -import gc -import time 
-from unittest.mock import patch - -import numpy as np -from tqdm import tqdm - -import nncf.utils -from nncf import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization -from nncf.tensor import Tensor - - -def get_random_weights(size, amount, n_unique_shapes, dtype, is_sorted=True): - n_channels = set() - while len(n_channels) < n_unique_shapes: - n_channels.add(int(np.random.normal(np.sqrt(size), n_unique_shapes))) - n_channels = list(n_channels) - - unique_weights = [] - for d in n_channels: - shape = (size // d, d) - unique_weights.append(Tensor(np.random.random(shape).astype(dtype))) - - result = [] - for _ in range(amount): - result.append(np.random.choice(unique_weights)) - - if is_sorted: - result = sorted(result, key=lambda x: x.shape[0] * x.shape[1], reverse=True) - return result - - -def measure_compression_time(weights, config, is_ov, verbose=True): - orig_value = nncf.utils._openvino_available - nncf.utils._openvino_available = is_ov - - start_time = time.perf_counter() - for w in tqdm(weights, disable=not verbose): - do_int_quantization( - # calculate_quantized_dequantized_weight( - w, - config, - reduction_axes=(1,), - ov_model_params=OVModelParameters( - input_dtype=w.dtype, - output_dtype=None, - dynamic_shapes=bool(0), - recompile=bool(0), - release_memory=bool(1), - share_inputs=bool(1), - share_outputs=bool(1), - return_ov_tensors=bool(0), - ), - # return_compressed_weight=bool(1) - ) - end_time = time.perf_counter() - total_time = end_time - start_time - avg_time = total_time / len(weights) - if verbose: - print("OV" if is_ov else "NP", f"avg. time: {avg_time:.1e} sec.") - - nncf.utils._openvino_available = orig_value - OV_MODEL_CACHE.clear() - gc.collect() - return avg_time - - -def bin_search(l, r, config, n, dtype): - while r / l > 1.05: - m = np.sqrt(l * r) - weights = get_random_weights( - size=int(m), - amount=n, - # n_unique_shapes=int(np.sqrt(n)), - n_unique_shapes=1, - dtype=dtype, - ) - t_np = measure_compression_time( - weights, - config, - is_ov=False, - verbose=False, - ) - t_ov = measure_compression_time( - weights, - config, - is_ov=True, - verbose=False, - ) - print(f"S: {m:.1e}. NP time: {t_np:.1e} sec. 
OV time: {t_ov:.1e} sec.") - if t_np < t_ov: - l = m - else: - r = m - - -N = int(1e5) -S = int(5e5) # 5e5 for compression/decompression, -K = int(np.sqrt(N)) -DTYPE = np.float32 - -bin_search( - l=int(1e2), - r=int(1e5), - config=WeightCompressionConfig(CompressWeightsMode.INT4_ASYM, group_size=-1), - n=N, - dtype=DTYPE, -) - -# weights = get_random_weights(size=S, amount=N, n_unique_shapes=K, dtype=np.float32) -# for is_ov in [False, True]: -# measure_compression_time( -# weights, -# WeightCompressionConfig( -# CompressWeightsMode.INT4_ASYM, -# group_size=-1 -# ), -# is_ov=is_ov, -# ) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 3e14be11561..24364c592d9 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -278,7 +278,6 @@ def _create_compression_subgraph( if should_add_convert_node: mul = opset.convert(mul, const_dtype, name=f"{const_node_name}/fq_weights_{weight_port_id}/convert") - # TODO: convert tensors inside compressed_weight to numpy backend if they are in ov backend return mul, compressed_weight def transform_model( diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 14df740d2c9..f65049e6dff 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -436,7 +436,7 @@ def do_int_quantization( precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, invert_division: Optional[bool] = False, - ov_model_params: Optional["OVModelParameters"] = None, + ov_model_params: Optional = None, ): """ Performs integer quantization on the given weight tensor. 
@@ -468,7 +468,7 @@ def do_int_quantization( # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) - if not accelerate_through_ov or True: + if not accelerate_through_ov: # Reference implementation if weight.backend == TensorBackend.ov: @@ -552,7 +552,7 @@ def calculate_quantized_dequantized_weight( precomputed_zero_point: Optional[Tensor] = None, invert_division: Optional[bool] = False, return_compressed_weight: Optional[bool] = False, - ov_model_params: Optional["OVModelParameters"] = None, + ov_model_params: Optional = None, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: # import os accelerate_through_ov = ( diff --git a/run_weight_compression.py b/run_weight_compression.py deleted file mode 100644 index 74d752ef4de..00000000000 --- a/run_weight_compression.py +++ /dev/null @@ -1,373 +0,0 @@ -import os -import shutil -import subprocess -import threading -import time -from pathlib import Path - - -def stream_handler(stream, target_file): - for line in iter(stream.readline, ''): - print(line, end='') - target_file.write(line) - - -parent_model_dir = Path("/home/nsavel/workspace/models/hf") -parent_log_dir = Path("compression_logs") - -experiment_params = [ - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", 
parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory --share-outputs"), - - - 
# (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory --share-outputs"), - # # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile --share-outputs"), - # 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --share-outputs"), - # 
(parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - # # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory"), - # 
(parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - - - - - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", 
"--save-model --compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / 
"reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", 
parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / 
"reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / 
"optimal_configurations/int8/tiny-llama", "--save-model --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --release-memory"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--release-memory"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--release-memory"), - # - # - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / 
"optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), -] - -for model_dir, log_dir, params in experiment_params: - model_path = model_dir / "openvino_model.xml" - cmd = f"/home/nsavel/venvs/nncf/bin/python weight_compression.py --model-path {model_path} --log-dir {log_dir} {params}" - - log_dir.mkdir(parents=True, exist_ok=True) - with open(log_dir / "log.txt", "a") as log_file: - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True, - universal_newlines=True, - preexec_fn=os.setsid, - ) - - stdout_thread = threading.Thread(target=stream_handler, args=(process.stdout, log_file)) - stderr_thread = threading.Thread(target=stream_handler, args=(process.stderr, log_file)) - - stdout_thread.start() - stderr_thread.start() - - stdout_thread.join() - stderr_thread.join() - - process.wait() - time.sleep(10) - 
-evaluated_paths = set() -for _, log_dir, _ in experiment_params: - for model_path in sorted(log_dir.rglob("**/*")): - model_path: Path - if model_path.suffix != ".xml": - continue - if model_path.absolute() in evaluated_paths: - continue - evaluated_paths.add(model_path.absolute()) - - model_dir = model_path.parent.absolute() - cmd = f"/home/nsavel/venvs/lm-evaluation-harness/bin/lm_eval --model openvino --model_args pretrained={model_dir},trust_remote_code=True --tasks wikitext --output_path {model_dir}" - process = subprocess.Popen(cmd, shell=True) - process.wait() diff --git a/weight_compression.py b/weight_compression.py deleted file mode 100644 index bae1948145c..00000000000 --- a/weight_compression.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import gc -import os -import shutil -import time -from functools import partial -from pathlib import Path - -import openvino as ov - -import nncf -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE -from tools.memory_monitor import MemoryMonitor -from tools.memory_monitor import MemoryType - - -def parse_arguments(): - parser = argparse.ArgumentParser() - - parser.add_argument("--model-path", type=str, required=True, help="Path where the model is stored") - - parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved") - - parser.add_argument("--compression-mode", default="int8_asym", type=str, choices=["int8_asym", "int8_sym", "int4_asym", "int4_sym",], help="Weight compression mode") - - parser.add_argument("--numpy", action="store_true", help="Enable numpy compression") - - parser.add_argument("--dynamic", action="store_true", help="Enable compression with dynamic-shaped OV models") - - parser.add_argument("--input-dtype", type=str, choices=["fp32", "fp16", "bf16"], default=None, help="OV model input dtype") - - parser.add_argument("--recompile", action="store_true", help="Recompile model every time") - - parser.add_argument("--share-outputs", action="store_true", help="Share OV model outputs") - - parser.add_argument("--save-model", action="store_true", help="Save compressed model") - - parser.add_argument("--release-memory", action="store_true", help="Release memory") - - return parser.parse_args() - - -def log(mm, fz, log_dir): - mm.save_memory_logs( - *mm.get_data(memory_from_zero=fz), save_dir=Path(log_dir), filename_suffix="_from-zero" if fz else "" - ) - - -def count_node_dtypes(model): - # Get the main dtype of weight constants - node_count_per_dtype = dict(f32=0, f16=0, bf16=0) - for node in model.get_ordered_ops(): - friendly_name = node.get_friendly_name() - if node.get_type_name() != "Constant" or ".weight" not in friendly_name: - continue - const_dtype = node.get_element_type().get_type_name() - if const_dtype in node_count_per_dtype: - node_count_per_dtype[const_dtype] = node_count_per_dtype[const_dtype] + 1 - return node_count_per_dtype - - -def 
main(args): - model_path = Path(args.model_path) - log_dir = Path(args.log_dir) - - numpy_compression = args.numpy - dynamic_compression = args.dynamic - input_dtype = args.input_dtype - recompile = args.recompile - share_outputs = args.share_outputs - save_model = args.save_model - release_memory = args.release_memory - - log_dir_suffix = f"{model_path.parent.name}_" - if numpy_compression: - log_dir_suffix = f"{log_dir_suffix}numpy" - else: - log_dir_suffix = f"{log_dir_suffix}{'ov-dynamic' if dynamic_compression else 'ov-static'}" - if input_dtype is not None: - log_dir_suffix = f"{log_dir_suffix}_{f'input-{input_dtype}'}" - if recompile: - log_dir_suffix = f"{log_dir_suffix}_recompile" - if release_memory: - log_dir_suffix = f"{log_dir_suffix}_release-memory" - if share_outputs: - log_dir_suffix = f"{log_dir_suffix}_share-outputs" - print(f"Log dir suffix: {log_dir_suffix}") - - memory_monitors = [] - for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]: - memory_monitor = MemoryMonitor(interval=1e-2, memory_type=memory_type, include_child_processes=bool(0)) - memory_monitor.start(at_exit_fn=partial(log, memory_monitor, mem_from_zero, log_dir / log_dir_suffix)) - memory_monitors.append(memory_monitor) - - core = ov.Core() - # core.set_property({"ENABLE_MMAP": "NO"}) - model = core.read_model(model_path) - - node_count_per_dtype = count_node_dtypes(model) - assert max(node_count_per_dtype.values()) == sum(node_count_per_dtype.values()), "Not all consts have the same type" - node_count_per_dtype = sorted([(v, k) for k, v in node_count_per_dtype.items()], reverse=True) - model_dtype = dict(f32="fp32", f16="fp16", bf16="bf16")[node_count_per_dtype[0][1]] - - # Update input dtype based on model - input_dtype = input_dtype or model_dtype - - os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}" - os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}" - os.environ["INPUT_DTYPE"] = input_dtype - os.environ["RECOMPILE"] = f"{int(recompile)}" - os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}" - os.environ["RELEASE_MEMORY"] = f"{int(release_memory)}" - - start_time = time.perf_counter() - if args.compression_mode == "int8_asym": - compression_mode = nncf.CompressWeightsMode.INT8_ASYM - elif args.compression_mode == "int8_sym": - compression_mode = nncf.CompressWeightsMode.INT8_SYM - elif args.compression_mode == "int4_asym": - compression_mode = nncf.CompressWeightsMode.INT4_ASYM - elif args.compression_mode == "int4_sym": - compression_mode = nncf.CompressWeightsMode.INT4_SYM - else: - raise ValueError(f"Unknown weight compression mode argument: {args.compression_mode}") - # TODO: Consider all_layers=True? 
- compressed_model = nncf.compress_weights(model, mode=compression_mode) - compression_time = time.perf_counter() - start_time - print(f"Compression Time: {compression_time:.2f} sec.") - - if save_model: - ov.save_model(compressed_model, log_dir / log_dir_suffix / "openvino_model.xml") - for filepath in model_path.parent.glob("*.json"): - shutil.copy(str(filepath), str(log_dir / log_dir_suffix / filepath.name)) - - del core - del model - del compressed_model - gc.collect() - time.sleep(0.5) - - before_cache_deletion = memory_monitors[2].get_data(True)[1][-1] - if not OV_MODEL_CACHE.is_empty(): - OV_MODEL_CACHE.clear() - gc.collect() - time.sleep(memory_monitors[0].interval * 10) - after_cache_deletion = memory_monitors[2].get_data(True)[1][-1] - else: - after_cache_deletion = before_cache_deletion - cache_size = before_cache_deletion - after_cache_deletion - print(f"Cache size: {cache_size:.2f} MiB") - - time.sleep(memory_monitors[0].interval * 10) - - leftover_memory = memory_monitors[2].get_data(True)[1][-1] - peak_memory = max(memory_monitors[2].get_data(True)[1]) - print(f"Peak memory: {peak_memory:.2f} MiB") - print(f"Leftover memory: {leftover_memory:.2f} MiB") - print("Done") - - csv_path = log_dir / "results.csv" - csv_exists = csv_path.exists() - csv_path.parent.mkdir(exist_ok=True, parents=True) - with open(csv_path, "a") as f: - if not csv_exists: - f.write( - "Model Path," - "Model dtype," - "Backend," - "Recompile," - "Release memory," - "Share outputs," - "Input Shapes," - "Input," - "Compression Time," - "Peak Memory," - "Cache Size," - "Leftover Memory" - "\n" - ) - f.write( - f"{model_path}," - f"{model_dtype.upper()}," - f"{'-' if numpy_compression else 'OV'}," - f"{'-' if numpy_compression else recompile}," - f"{'-' if numpy_compression else release_memory}," - f"{'-' if numpy_compression else share_outputs}," - f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'}," - f"{'-' if numpy_compression else input_dtype.upper()}," - f"{compression_time:.2f}," - f"{peak_memory:.2f}," - f"{cache_size:.2f}," - f"{leftover_memory:.2f}" - f"\n" - ) - - -if __name__ == "__main__": - args = parse_arguments() - main(args) From dc30d8d94981b205437917242897b5e92e59bd8a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 31 Oct 2024 18:54:23 +0100 Subject: [PATCH 21/73] Tests WIP --- .../algorithms/weight_compression/config.py | 3 + .../weight_compression/openvino_backend.py | 5 +- .../weight_compression/openvino_modeling.py | 29 +- .../weight_compression/scale_estimation.py | 12 +- .../weight_compression/weight_lowering.py | 34 +- nncf/quantization/fake_quantize.py | 2 +- nncf/tensor/functions/ov.py | 27 +- .../quantization/test_openvino_modeling.py | 307 ++++++++++++++++++ 8 files changed, 376 insertions(+), 43 deletions(-) create mode 100644 tests/openvino/native/quantization/test_openvino_modeling.py diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 03590fc5ff3..85179df2fe4 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -54,6 +54,9 @@ def is_integer(self): def __hash__(self): return hash((self.mode.value, self.group_size)) + def __str__(self): + return f"{self.mode.value}_{self.group_size}" + @dataclass class WeightCompressionParameters: diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py 
b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 24364c592d9..3a262bd5d12 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -46,6 +46,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorBackend @@ -127,6 +128,7 @@ def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov. def get_weight_dtype( self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph ) -> TensorDataType: + # TODO: use from nncf.tensor.functions.ov import DTYPE_MAP ov_type_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["dtype"] dtype_map = { "f16": TensorDataType.float16, @@ -277,7 +279,6 @@ def _create_compression_subgraph( if should_add_convert_node: mul = opset.convert(mul, const_dtype, name=f"{const_node_name}/fq_weights_{weight_port_id}/convert") - return mul, compressed_weight def transform_model( @@ -344,6 +345,8 @@ def transform_model( # reset name_to_node_mapping self.name_to_node_mapping = None + OV_MODEL_CACHE.clear() + return model @staticmethod diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index a1c99241b4c..9f2fed9e03e 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -36,7 +36,7 @@ class OVModelParameters: input_dtype: TensorDataType output_dtype: Optional[TensorDataType] = None - dynamic_shapes: bool = False + dynamic_shapes: bool = True # TODO: set to False once 156511 is resolved recompile: bool = False release_memory: bool = True share_inputs: bool = True @@ -124,7 +124,8 @@ def get_compress_decompress_weight_model( ) -> ModelCallable: if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) - scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) + if scale_shape is not None: + scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) if zero_point_shape is not None: zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) @@ -223,19 +224,18 @@ def _build_compress_model( else: dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4 + compressed_w = opset.round(compressed_w) compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high) compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights") ov_results = [compressed_w] - if len(ov_parameters) != 3: - # Two cases: - # 1. weight -> compressed_weight, scale, (zero_point) - # 2. 
weight, scale -> compressed_weight, (zero_point) - if len(ov_parameters) == 1: - ov_results.append(scale) - + if len(ov_parameters) == 1: + ov_results.append(scale) if zero_point is not None: - ov_results.append(opset.convert(zero_point, compressed_w.get_element_type())) + zero_point_dtype = compressed_w.get_element_type() if ov_model_params.return_ov_tensors else ov.Type.i32 + if zero_point.get_element_type() != zero_point_dtype: + zero_point = opset.convert(zero_point, zero_point_dtype) + ov_results.append(zero_point) if return_nodes: return ov_parameters, ov_results @@ -264,18 +264,13 @@ def _build_compress_decompress_model( if len(ov_parameters) == 1: # weight -> compressed_weight, scale, zero_point compressed_w, scale, zero_point = ov_results - elif len(ov_parameters) == 2: - # weight, scale -> compressed_weight, zero_point - compressed_w, zero_point = ov_results - scale = ov_parameters[1] else: # weight, scale, zero_point -> compressed_weight compressed_w = ov_results[0] scale, zero_point = ov_parameters[1:] - decompressed_w = scale * opset.convert( - opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32), ov.Type.f32 - ) + subtract_zero_point = opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32) + decompressed_w = scale * opset.convert(subtract_zero_point, ov.Type.f32) else: if len(ov_parameters) == 1: # weight -> compressed_weight, scale diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 9e5fbb3d678..2e4c695b7f5 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -110,7 +110,7 @@ def apply( graph: NNCFGraph, statistic_points: Optional[StatisticPointsContainer] = None, dataset: Optional[Dataset] = None, - ) -> Dict[str, Tensor]: + ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: """ Estimates better scale for the int4 nodes in the model. Minimizes per-group difference between floating point MatMul and @@ -122,10 +122,10 @@ def apply( :param graph: Model graph. :param statistic_points: Statistic points with collected statistics values. :param dataset: A representative dataset for the calibration process. - :return: Dict with pairs (weight name, estimated scale). + :return: Two dictionaries for estimated scales and zero points for each weight name. 
""" - scales = dict() + scales, zero_points = dict(), dict() for wp in track(self._all_weight_params, description="Applying Scale Estimation"): weight_name = wp.weight_name @@ -145,7 +145,7 @@ def apply( weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - scales[weight_name], _ = self.calculate_quantization_params( + scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params( self._backend_entity, stats, weight, @@ -157,7 +157,7 @@ def apply( self._weight_penalty, ) - return scales + return scales, zero_points @staticmethod def calculate_quantization_params( @@ -352,6 +352,8 @@ def calculate_quantization_params( if config.group_size == -1: result_scale = fns.squeeze(result_scale, axis=1) + if zp is not None and config.group_size == -1: + zp = fns.squeeze(zp, axis=1) return result_scale, zp diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index f65049e6dff..1aad39d5c5d 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -142,7 +142,9 @@ def calculate_e2m1_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val= return scale -def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bits=4, invert_division=False) -> Tensor: +def calculate_signed_scale( + weight: Tensor, reduction_axes: ReductionAxes, num_bits=4, invert_division: Optional[bool] = True +) -> Tensor: """ Calculates the signed scale for symmetric quantization. @@ -255,7 +257,10 @@ def calculate_normalized_weight_and_fp4_scale( def calculate_integer_quantization_params( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, invert_division=False + weight: Tensor, + reduction_axes: ReductionAxes, + config: WeightCompressionConfig, + invert_division: Optional[bool] = True, ) -> Tuple[Tensor, Tensor]: """ Calculates the scale and zero point for uniform quantization (INT4, INT8), when the range of values is divided into @@ -291,7 +296,7 @@ def calculate_quantized_weight( config: WeightCompressionConfig, scale: Tensor, zero_point: Optional[Tensor] = None, - invert_division=False, + invert_division: Optional[bool] = True, ) -> Tensor: """ Quantizes the weight tensor using the provided scale and zero point. @@ -327,7 +332,10 @@ def calculate_quantized_weight( def get_integer_quantization_error( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, invert_division=False + weight: Tensor, + reduction_axes: ReductionAxes, + config: WeightCompressionConfig, + invert_division: Optional[bool] = True, ) -> float: """ Calculates a quantity characterizing the difference between floating point weights and fake quantized @@ -361,7 +369,7 @@ def compress_weight( config: WeightCompressionConfig, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, - invert_division=False, + invert_division: Optional[bool] = True, ): """ Compress weight using compression configuration. @@ -435,7 +443,7 @@ def do_int_quantization( reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, - invert_division: Optional[bool] = False, + invert_division: Optional[bool] = True, ov_model_params: Optional = None, ): """ @@ -453,6 +461,11 @@ def do_int_quantization( :return: A tuple containing the compressed weights, scale, and zero point tensors. 
""" assert config.is_integer, "The function supports integer quantization only" + if config.is_int_asym and (precomputed_scale is None) != (precomputed_zero_point is None): + raise ValueError( + "If precomputed quantization parameters are provided, both scale and zero point are required " + "for asymmetric quantization." + ) # import os accelerate_through_ov = ( @@ -528,11 +541,8 @@ def do_int_quantization( # Scale is always in fp32 so there is no need to store it in ov.Tensor if scale.backend == TensorBackend.ov: scale = scale.to_backend(TensorBackend.numpy) - elif precomputed_zero_point is None and config.is_int_asym: - # weight, scale -> compressed_weight, zero_point - compressed_weight, zero_point = model([weight, precomputed_scale]) - scale = precomputed_scale else: + # weight, scale, (zero_point) -> compressed_weight inputs = ( [weight, precomputed_scale] if precomputed_zero_point is None @@ -550,7 +560,7 @@ def calculate_quantized_dequantized_weight( reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, precomputed_zero_point: Optional[Tensor] = None, - invert_division: Optional[bool] = False, + invert_division: Optional[bool] = True, return_compressed_weight: Optional[bool] = False, ov_model_params: Optional = None, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: @@ -606,7 +616,7 @@ def calculate_quantized_dequantized_weight( if precomputed_zero_point is not None: inputs.append(precomputed_zero_point) - compressed_weight, scale, zero_point = None, None, None + compressed_weight, scale, zero_point = None, precomputed_scale, precomputed_zero_point results = model(inputs) if len(results) == 1: decompressed_weight = results[0] diff --git a/nncf/quantization/fake_quantize.py b/nncf/quantization/fake_quantize.py index a225f53853a..3e7cee04bc1 100644 --- a/nncf/quantization/fake_quantize.py +++ b/nncf/quantization/fake_quantize.py @@ -344,7 +344,7 @@ def calculate_scale_zero_point( level_low: int, level_high: int, narrow_range: bool, - invert_division: Optional[bool] = False, + invert_division: Optional[bool] = True, ) -> Tuple[Tensor, Tensor]: """ Calculates scale and zero_point values for the quantizer. 
diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index f8cd0431f83..b5083a4b284 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -17,7 +17,8 @@ from nncf.tensor.functions import numeric from ..definitions import TensorBackend -from .numpy_numeric import DTYPE_MAP as NP_DTYPE_MAP +from .numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP +from .numpy_numeric import DTYPE_MAP_REV as DTYPE_MAP_REV_NP DTYPE_MAP = { TensorDataType.float16: ov.Type.f16, @@ -40,7 +41,6 @@ def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model a_dtype = DTYPE_MAP_REV[a.get_element_type()] - assert a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4] model = get_astype_model( OVModelParameters( @@ -65,11 +65,13 @@ def _(a: ov.Tensor) -> TensorBackend: @numeric.astype.register(ov.Tensor) def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: - a_dtype = DTYPE_MAP_REV[a.get_element_type()] - if a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]: + if a.get_element_type() in [ov.Type.bf16, ov.Type.i4, ov.Type.u4] or dtype in [ + TensorDataType.bfloat16, + TensorDataType.int4, + TensorDataType.uint4, + ]: return _ov_astype(a, dtype) - - return ov.Tensor(a.data.astype(NP_DTYPE_MAP[dtype])) + return ov.Tensor(a.data.astype(DTYPE_MAP_NP[dtype])) @numeric.dtype.register(ov.Tensor) @@ -87,8 +89,19 @@ def _(a: ov.Tensor, shape: Union[int, Tuple[int, ...]]) -> ov.Tensor: return ov.Tensor(a.data.reshape(shape), shape, a.get_element_type()) +@numeric.to_backend.register(np.ndarray) +def _(a: np.ndarray, b: TensorBackend) -> Union[np.ndarray, ov.Tensor]: + if b == TensorBackend.numpy: + return a + if b != TensorBackend.ov: + raise ValueError("Not supported backend") + return ov.Tensor(a, a.shape, DTYPE_MAP[DTYPE_MAP_REV_NP[a.dtype]]) + + @numeric.to_backend.register(ov.Tensor) -def _(a: ov.Tensor, b: TensorBackend) -> np.ndarray: +def _(a: ov.Tensor, b: TensorBackend) -> Union[np.ndarray, ov.Tensor]: + if b == TensorBackend.ov: + return a if b != TensorBackend.numpy: raise ValueError("Not supported backend") diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py new file mode 100644 index 00000000000..872173c0990 --- /dev/null +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -0,0 +1,307 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import inspect +from collections import defaultdict +from contextlib import contextmanager +from enum import Enum +from unittest.mock import patch + +import numpy as np +import openvino as ov +import pytest + +from nncf import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model +from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization +from nncf.results_caching import ResultsCacheContainer +from nncf.results_caching import cache_results +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType +from nncf.tensor.definitions import TensorBackend +from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP +from nncf.tensor.functions.numpy_numeric import DTYPE_MAP_REV as DTYPE_MAP_REV_NP +from nncf.tensor.functions.ov import DTYPE_MAP as DTYPE_MAP_OV + + +class ComputationBackend(Enum): + NumPy = "numpy" + OV = "ov" + + +class QuantizationTask(Enum): + Q = "quantize" + Q_DQ = "quantize_dequantize" + Q_DQ_RQ = "quantize_dequantize_return_quantized" + + +COMPRESSION_CONFIGS = [ + WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + WeightCompressionConfig(CompressWeightsMode.INT8_SYM), + WeightCompressionConfig(CompressWeightsMode.INT4_ASYM), + WeightCompressionConfig(CompressWeightsMode.INT4_SYM), + WeightCompressionConfig(CompressWeightsMode.INT4_ASYM, group_size=2), + WeightCompressionConfig(CompressWeightsMode.INT4_SYM, group_size=2), +] + + +DATA_TYPES = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] + +WEIGHT_SHAPE = (1000, 4) + +TENSOR_BACKENDS = [TensorBackend.numpy, TensorBackend.ov] + +reduction_axes = (1,) + + +RANDOM_TENSOR_CACHE_CONTAINER = ResultsCacheContainer() + + +@cache_results(RANDOM_TENSOR_CACHE_CONTAINER) +def get_random_float_tensor(shape, dtype, backend, seed=0): + np.random.seed(seed) + data = np.random.normal(size=shape) + data = data.astype(np.float16 if dtype == TensorDataType.float16 else np.float32) + + if backend == TensorBackend.ov or dtype == TensorDataType.bfloat16: + data = Tensor(ov.Tensor(data, shape, DTYPE_MAP_OV[DTYPE_MAP_REV_NP[data.dtype]])) + if dtype == TensorDataType.bfloat16: + data = data.astype(TensorDataType.bfloat16) + if backend == TensorBackend.numpy: + data = data.to_backend(TensorBackend.numpy) if dtype == TensorDataType.bfloat16 else Tensor(data) + return Tensor(data) + + +@cache_results(RANDOM_TENSOR_CACHE_CONTAINER) +def get_random_integer_tensor(shape, low, high, dtype, backend, seed=0): + np.random.seed(seed) + data = np.random.randint(low, high, size=shape).astype(DTYPE_MAP_NP[dtype]) + if backend == TensorBackend.ov: + data = ov.Tensor(data, shape, DTYPE_MAP_OV[dtype]) + return Tensor(data) + + +@contextmanager +def openvino_available(available: bool): + import nncf.utils + + original_value = nncf.utils._openvino_available + nncf.utils._openvino_available = available + yield + nncf.utils._openvino_available = original_value + + +@pytest.mark.parametrize("config", COMPRESSION_CONFIGS, ids=[str(c) for c in COMPRESSION_CONFIGS]) +# 
@pytest.mark.parametrize("config", [WeightCompressionConfig(CompressWeightsMode.INT8_ASYM)]) +@pytest.mark.parametrize( + ("quantization_task", "tensor_backend"), + [ + (QuantizationTask.Q, TensorBackend.numpy), + (QuantizationTask.Q, "auto"), + (QuantizationTask.Q, TensorBackend.ov), + (QuantizationTask.Q_DQ, TensorBackend.numpy), + (QuantizationTask.Q_DQ, "auto"), + (QuantizationTask.Q_DQ_RQ, TensorBackend.numpy), + (QuantizationTask.Q_DQ_RQ, "auto"), + ], +) +@pytest.mark.parametrize("dtype", DATA_TYPES) +@pytest.mark.parametrize("precompute", [False, True]) +def test_quantization_alignment(config, quantization_task, tensor_backend, dtype, precompute): + d1, d2 = WEIGHT_SHAPE + group_size = config.group_size + zero_point_shape = scale_shape = (d1, 1) if group_size == -1 else (d1, d2 // group_size, 1) + level_low, level_high = 0, 2**config.num_bits - 1 + + results = defaultdict(dict) + # Iterate over two implementations + for cb in [ComputationBackend.NumPy, ComputationBackend.OV]: + # A context manager to enable/disable ov implementation + with openvino_available(cb == ComputationBackend.OV): + # OV tensor backend for weight is only supported for quantization task + if quantization_task == QuantizationTask.Q and ( + tensor_backend == TensorBackend.ov or cb == ComputationBackend.OV and tensor_backend == "auto" + ): + weight_tensor_backend = TensorBackend.ov + else: + weight_tensor_backend = TensorBackend.numpy + + # Generate input tensors + weight = get_random_float_tensor(WEIGHT_SHAPE, dtype, weight_tensor_backend) + precomputed_scale, precomputed_zero_point = None, None + if precompute: + # For precomputed mode, the weight is assumed to be already reshaped + if group_size != -1: + weight, _ = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) + + precomputed_scale = get_random_float_tensor(scale_shape, TensorDataType.float32, TensorBackend.numpy) + if config.is_int_asym: + precomputed_zero_point = get_random_integer_tensor( + zero_point_shape, level_low, level_high, TensorDataType.int32, TensorBackend.numpy + ) + + if quantization_task == QuantizationTask.Q: + fn_to_call = do_int_quantization + fn_to_patch = get_compress_weight_model + else: + fn_to_call = calculate_quantized_dequantized_weight + fn_to_patch = get_compress_decompress_weight_model + patch_path = f"{inspect.getmodule(fn_to_patch).__name__}.{fn_to_patch.__name__}" + with patch(patch_path, side_effect=fn_to_patch) as mock: + # For precomputed mode, all inputs are assumed to be already reshaped + r_axes = None if precompute else reduction_axes + kwargs = {"return_compressed_weight": True} if quantization_task == QuantizationTask.Q_DQ_RQ else {} + outputs = fn_to_call(weight, config, r_axes, precomputed_scale, precomputed_zero_point, **kwargs) + + decompressed_weight, compressed_weight, scale, zero_point = (None,) * 4 + if quantization_task == QuantizationTask.Q: + compressed_weight, scale, zero_point = outputs + elif quantization_task == QuantizationTask.Q_DQ: + decompressed_weight = outputs[0] + else: + decompressed_weight, compressed_weight, scale, zero_point = outputs + + if cb == ComputationBackend.NumPy: + mock.assert_not_called() + else: + mock.assert_called_once() + + if quantization_task != QuantizationTask.Q_DQ: + # Scale should always be float32 and numpy backend + assert scale.dtype == TensorDataType.float32 + assert scale.backend == TensorBackend.numpy + if precompute: + # In case of precomputed scale or zero point, the returned scale and z.p. 
should equal the given ones + np.testing.assert_allclose(precomputed_scale.data, scale.data) + if config.is_int_asym: + np.testing.assert_allclose(precomputed_zero_point.data, zero_point.data) + + if ( + quantization_task == QuantizationTask.Q + and cb == ComputationBackend.OV + and weight_tensor_backend == TensorBackend.ov + and config.num_bits == 4 + ): + # For 4 bit compression in case of ov implementation and ov backend the compressed weight and the computed + # zero point must be in ov backend and have (u)int4 dtype in order to be able to insert them into OV model + # without re-packing + assert compressed_weight.backend == TensorBackend.ov + assert compressed_weight.dtype == (TensorDataType.uint4 if config.is_int_asym else TensorDataType.int4) + if config.is_int_asym and not precompute: + assert zero_point.backend == TensorBackend.ov + assert zero_point.dtype == TensorDataType.uint4 + else: + if quantization_task != QuantizationTask.Q_DQ: + # Otherwise compressed weight and zero point must be returned in numpy backend, compressed weight must + # be of (u)int8 data type, zero point -- in int32 + assert compressed_weight.backend == TensorBackend.numpy + assert compressed_weight.dtype == (TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8) + if config.is_int_asym and not precompute: + assert zero_point.backend == TensorBackend.numpy + assert zero_point.dtype == TensorDataType.int32 + if quantization_task != QuantizationTask.Q: + assert decompressed_weight.backend == TensorBackend.numpy + assert decompressed_weight.dtype == TensorDataType.float32 + + # Save results for comparison between implementations + if quantization_task != QuantizationTask.Q: + results[cb]["decompressed_weight"] = decompressed_weight + if quantization_task != QuantizationTask.Q_DQ: + results[cb]["compressed_weight"] = compressed_weight.to_backend(TensorBackend.numpy) + results[cb]["scale"] = scale + if config.is_int_asym: + results[cb]["zero_point"] = zero_point.to_backend(TensorBackend.numpy) + + keys = set(results[ComputationBackend.OV]).union(set(results[ComputationBackend.NumPy])) + # Check that the computed tensors are equal between implementations + for key in keys: + numpy_result = results[ComputationBackend.NumPy][key].data + ov_result = results[ComputationBackend.OV][key].data + np.testing.assert_allclose(numpy_result, ov_result, err_msg=f"Results do not align for {key}.") + + +# @pytest.mark.parametrize("mode", COMPRESSION_MODES) +# @pytest.mark.parametrize("group_size", [2]) +# def test_grouped_quantization(mode, group_size): +# if mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM]: +# pytest.skip("Group size is not applicable for INT8 modes") +# +# # Generate random weight tensor +# weight_shape = (128, 4) +# weight = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.numpy) +# +# # Create WeightCompressionConfig +# config = WeightCompressionConfig(mode, group_size=group_size) +# +# # Patch is_openvino_available to control the implementation +# with patch("nncf.utils.is_openvino_available", return_value=False): +# # Reference implementation +# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, reduction_axes) +# +# with patch("nncf.utils.is_openvino_available", return_value=True): +# # OpenVINO implementation +# ov_model_params = OVModelParameters(weight.dtype) +# decompressed_weight_ov = calculate_quantized_dequantized_weight( +# weight, config, reduction_axes, ov_model_params=ov_model_params +# ) +# +# # Compare 
decompressed weights +# np.testing.assert_allclose(decompressed_weight_ref.data, decompressed_weight_ov.data, atol=1e-5, rtol=1e-4) +# +# +# def test_weight_dtypes(): +# # Test different weight data types +# weight_shape = (128, 4) +# for dtype in DATA_TYPES: +# weight = get_random_float_tensor(weight_shape, dtype, TensorBackend.numpy) +# config = WeightCompressionConfig(CompressWeightsMode.INT8_SYM) +# +# # Reference implementation +# with patch("nncf.utils.is_openvino_available", return_value=False): +# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, reduction_axes) +# +# # OpenVINO implementation +# with patch("nncf.utils.is_openvino_available", return_value=True): +# ov_model_params = OVModelParameters(weight.dtype) +# decompressed_weight_ov = calculate_quantized_dequantized_weight( +# weight, config, reduction_axes, ov_model_params=ov_model_params +# ) +# +# # Compare decompressed weights +# np.testing.assert_allclose(decompressed_weight_ref.data, decompressed_weight_ov.data, atol=1e-5, rtol=1e-4) +# +# +# def test_tensor_backends(): +# # Test different tensor backends for do_int_quantization +# weight_shape = (128, 4) +# weight_numpy = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.numpy) +# weight_ov = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.ov) +# config = WeightCompressionConfig(CompressWeightsMode.INT8_SYM) +# +# # Reference implementation with numpy backend +# with patch("nncf.utils.is_openvino_available", return_value=False): +# compressed_weight_ref, scale_ref = do_int_quantization(weight_numpy, config, reduction_axes) +# +# # OpenVINO implementation with OV backend +# with patch("nncf.utils.is_openvino_available", return_value=True): +# ov_model_params = OVModelParameters(weight_ov.dtype) +# compressed_weight_ov, scale_ov = do_int_quantization( +# weight_ov, config, reduction_axes, ov_model_params=ov_model_params +# ) +# +# # Compare compressed weights +# np.testing.assert_allclose(compressed_weight_ref.data, compressed_weight_ov.data, atol=1e-5, rtol=1e-4) +# +# # Compare scales +# np.testing.assert_allclose(scale_ref.data, scale_ov.data, atol=1e-5, rtol=1e-4) From c5606cec5ea03f8bcfbb4ff8a395cdcc205bfc8a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 1 Nov 2024 17:32:28 +0100 Subject: [PATCH 22/73] Invert Tensor division --- nncf/tensor/tensor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nncf/tensor/tensor.py b/nncf/tensor/tensor.py index 1f776e19ad6..5db49985d4e 100644 --- a/nncf/tensor/tensor.py +++ b/nncf/tensor/tensor.py @@ -116,6 +116,12 @@ def __ipow__(self, other: Union[Tensor, float]) -> Tensor: self._data **= unwrap_tensor_data(other) return self + # def __truediv__(self, other: Union[Tensor, float]) -> Tensor: + # return self * _call_function("_binary_op_nowarn", 1.0, other, operator.truediv) + # + # def __rtruediv__(self, other: Union[Tensor, float]) -> Tensor: + # return other * _call_function("_binary_reverse_op_nowarn", self, 1.0, operator.truediv) + def __truediv__(self, other: Union[Tensor, float]) -> Tensor: return _call_function("_binary_op_nowarn", self, other, operator.truediv) From e6a9d56e2f6e0f9ef9d3d0764114f307747c42ca Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 4 Nov 2024 20:21:28 +0100 Subject: [PATCH 23/73] Add fns.divide --- .../weight_compression/weight_lowering.py | 39 +++++-------------- nncf/quantization/fake_quantize.py | 13 ++----- nncf/tensor/functions/__init__.py | 2 + nncf/tensor/functions/numeric.py | 
35 +++++++++++++++++ 4 files changed, 50 insertions(+), 39 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 1aad39d5c5d..20c4b3e539a 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -142,9 +142,7 @@ def calculate_e2m1_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val= return scale -def calculate_signed_scale( - weight: Tensor, reduction_axes: ReductionAxes, num_bits=4, invert_division: Optional[bool] = True -) -> Tensor: +def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bits=4) -> Tensor: """ Calculates the signed scale for symmetric quantization. @@ -159,10 +157,7 @@ def calculate_signed_scale( w_max = fns.max(weight, axis=reduction_axes, keepdims=True) scale = fns.where(w_abs_min >= w_max, w_abs_min, -w_max) - if invert_division: - scale *= 1.0 / level_high - else: - scale /= level_high + fns.inplace_divide(scale, level_high) eps = fns.finfo(scale).eps scale = fns.where(fns.abs(scale) < eps, eps, scale) @@ -183,7 +178,7 @@ def calculate_normalized_weight(weight: Tensor, scale: Tensor) -> Tensor: if scale.dtype != TensorDataType.float32: scale = scale.astype(TensorDataType.float32) - return weight / scale + return fns.divide(weight, scale) def do_nf4_quantization(weight: Tensor, scale: Tensor, is_normalized_weight: bool = False) -> Tensor: @@ -260,7 +255,6 @@ def calculate_integer_quantization_params( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, - invert_division: Optional[bool] = True, ) -> Tuple[Tensor, Tensor]: """ Calculates the scale and zero point for uniform quantization (INT4, INT8), when the range of values is divided into @@ -283,7 +277,7 @@ def calculate_integer_quantization_params( min_values = fns.min(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] max_values = fns.max(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] scale, zero_point = calculate_scale_zero_point( - min_values, max_values, level_low, level_high, narrow_range=False, invert_division=invert_division + min_values, max_values, level_low, level_high, narrow_range=False ) return scale, zero_point @@ -296,7 +290,6 @@ def calculate_quantized_weight( config: WeightCompressionConfig, scale: Tensor, zero_point: Optional[Tensor] = None, - invert_division: Optional[bool] = True, ) -> Tensor: """ Quantizes the weight tensor using the provided scale and zero point. @@ -305,7 +298,6 @@ def calculate_quantized_weight( :param config: Weight compression configuration. :param scale: Scale tensor used for quantization. :param zero_point: Zero point tensor used for quantization. - :param invert_division: applies inversion for scale and then multiply by weights instead of division. :return: Quantized weight tensor of uint8 or int8 type. 
""" if weight.dtype != TensorDataType.float32: @@ -319,10 +311,7 @@ def calculate_quantized_weight( level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - if invert_division: - compressed_weights = weight * (1.0 / scale) - else: - compressed_weights = weight / scale + compressed_weights = fns.divide(weight, scale) if zero_point is not None: compressed_weights += zero_point.astype(weight.dtype) compressed_weights = fns.round(compressed_weights) @@ -335,7 +324,6 @@ def get_integer_quantization_error( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, - invert_division: Optional[bool] = True, ) -> float: """ Calculates a quantity characterizing the difference between floating point weights and fake quantized @@ -351,9 +339,7 @@ def get_integer_quantization_error( if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - compressed_weights, scale, zero_point = do_int_quantization( - weight, config, reduction_axes, invert_division=invert_division - ) + compressed_weights, scale, zero_point = do_int_quantization(weight, config, reduction_axes) decompressed_weight = do_int_dequantization(compressed_weights, scale, zero_point) decompressed_weight = decompressed_weight.reshape(orig_shape) @@ -369,7 +355,6 @@ def compress_weight( config: WeightCompressionConfig, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, - invert_division: Optional[bool] = True, ): """ Compress weight using compression configuration. @@ -390,7 +375,7 @@ def compress_weight( ) return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_int_quantization( - weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, invert_division=invert_division + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point ) return CompressedWeight(compressed_weight, scale, zero_point) @@ -443,7 +428,6 @@ def do_int_quantization( reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, - invert_division: Optional[bool] = True, ov_model_params: Optional = None, ): """ @@ -455,8 +439,6 @@ def do_int_quantization( precomputed scale (and zero point) are provided. :param precomputed_scale: Optional precomputed scale tensor. :param precomputed_zero_point: Optional precomputed zero point tensor. - :param invert_division: Whether to apply inversion for scale and then multiply by weights instead of division. - Defaults to False. :param ov_model_params: OpenVINO model parameters for acceleration. :return: A tuple containing the compressed weights, scale, and zero point tensors. 
""" @@ -498,7 +480,7 @@ def do_int_quantization( if precomputed_zero_point is not None: zero_point = precomputed_zero_point - compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) + compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point) return compressed_weights, scale, zero_point from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters @@ -560,7 +542,6 @@ def calculate_quantized_dequantized_weight( reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, precomputed_zero_point: Optional[Tensor] = None, - invert_division: Optional[bool] = True, return_compressed_weight: Optional[bool] = False, ov_model_params: Optional = None, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: @@ -577,12 +558,12 @@ def calculate_quantized_dequantized_weight( # Reference implementation if precomputed_scale is None or (config.is_int_asym and precomputed_zero_point is None): compressed_weight, scale, zero_point = do_int_quantization( - weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, invert_division + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point ) else: scale = precomputed_scale if precomputed_scale is not None else None zero_point = precomputed_zero_point if precomputed_zero_point is not None else None - compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) + compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point) decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) if return_compressed_weight: return decompressed_weight, compressed_weight, scale, zero_point diff --git a/nncf/quantization/fake_quantize.py b/nncf/quantization/fake_quantize.py index 3e7cee04bc1..385cef9ca2e 100644 --- a/nncf/quantization/fake_quantize.py +++ b/nncf/quantization/fake_quantize.py @@ -11,7 +11,7 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Tuple import nncf from nncf.common.quantization.quantizers import calculate_asymmetric_level_ranges @@ -344,7 +344,6 @@ def calculate_scale_zero_point( level_low: int, level_high: int, narrow_range: bool, - invert_division: Optional[bool] = True, ) -> Tuple[Tensor, Tensor]: """ Calculates scale and zero_point values for the quantizer. @@ -360,17 +359,11 @@ def calculate_scale_zero_point( :return: Scale and Zero point values. 
""" levels = level_high - level_low if narrow_range else level_high - level_low + 1 - if invert_division: - scale = ((input_high - input_low) * (1.0 / (levels - 1))).astype(TensorDataType.float32) - else: - scale = ((input_high - input_low) / (levels - 1)).astype(TensorDataType.float32) + scale = fns.divide((input_high - input_low), (levels - 1)).astype(TensorDataType.float32) eps = fns.finfo(scale).eps # NOTE: adding machine epsilon to avoid division by zero scale = fns.where(fns.abs(scale) < eps, eps, scale) expected_level_low = level_low + 1 if narrow_range else level_low - if invert_division: - zero_point = expected_level_low - fns.round(input_low * (1.0 / scale)) - else: - zero_point = expected_level_low - fns.round(input_low / scale) + zero_point = expected_level_low - fns.round(fns.divide(input_low, scale)) zero_point = fns.clip(zero_point.astype(TensorDataType.int32), level_low, level_high) return scale, zero_point diff --git a/nncf/tensor/functions/__init__.py b/nncf/tensor/functions/__init__.py index 9affab79c90..52bc666dfa3 100644 --- a/nncf/tensor/functions/__init__.py +++ b/nncf/tensor/functions/__init__.py @@ -24,12 +24,14 @@ from nncf.tensor.functions.numeric import count_nonzero as count_nonzero from nncf.tensor.functions.numeric import device as device from nncf.tensor.functions.numeric import diag as diag +from nncf.tensor.functions.numeric import divide as divide from nncf.tensor.functions.numeric import dtype as dtype from nncf.tensor.functions.numeric import expand_dims as expand_dims from nncf.tensor.functions.numeric import eye as eye from nncf.tensor.functions.numeric import finfo as finfo from nncf.tensor.functions.numeric import flatten as flatten from nncf.tensor.functions.numeric import from_numpy as from_numpy +from nncf.tensor.functions.numeric import inplace_divide as inplace_divide from nncf.tensor.functions.numeric import isclose as isclose from nncf.tensor.functions.numeric import isempty as isempty from nncf.tensor.functions.numeric import item as item diff --git a/nncf/tensor/functions/numeric.py b/nncf/tensor/functions/numeric.py index cdec5788bf6..c6276a5e22f 100644 --- a/nncf/tensor/functions/numeric.py +++ b/nncf/tensor/functions/numeric.py @@ -911,3 +911,38 @@ def ceil(a: Tensor) -> Tensor: @tensor_guard def to_backend(a: Tensor, b: TensorBackend) -> Tensor: return Tensor(to_backend(a.data, b)) + + +@functools.singledispatch +@tensor_guard +def divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> Tensor: + """ + Divide two tensors or a tensor and a float. + This function divides `a` by `b`. If `invert` is True, it performs the division as `a * (1.0 / b)`. + Otherwise, it performs the division as `a / b`. + :param a: The first input tensor or float. + :param b: The second input tensor or float. + :param invert: If True, the division is performed as `a * (1.0 / b)`. If False, it is performed as `a / b`. + Defaults to True. + :return: A new tensor resulting from the division. + """ + return Tensor(a * (1.0 / b) if invert else a / b) + + +@functools.singledispatch +@tensor_guard +def inplace_divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> None: + """ + In-place division of two tensors or a tensor and a float. + This function divides `a` by `b` in place. If `invert` is True, it performs the division as `a *= (1.0 / b)`. + Otherwise, it performs the division as `a /= b`. + :param a: The first input tensor or float. + :param b: The second input tensor or float. 
+    :param invert: If True, the division is performed as `a *= (1.0 / b)`. If False, it is performed as `a /= b`.
+        Defaults to True.
+    :return: None. The operation is performed in place.
+    """
+    if invert:
+        a *= 1.0 / b
+    else:
+        a /= b

From ab90a089738ed06f4687fd05d58db0dbe1ff1798 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Wed, 6 Nov 2024 18:48:46 +0100
Subject: [PATCH 24/73] Adapt misalignment test to check the degree of misalignment

---
 .../weight_compression/openvino_modeling.py   |   6 +-
 nncf/tensor/functions/ov.py                   |   2 +-
 .../quantization/test_openvino_modeling.py    | 124 ++++++++++++++----
 3 files changed, 102 insertions(+), 30 deletions(-)

diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
index 9f2fed9e03e..d98fbde9324 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
@@ -36,7 +36,7 @@ class OVModelParameters:
     input_dtype: TensorDataType
     output_dtype: Optional[TensorDataType] = None
-    dynamic_shapes: bool = True  # TODO: set to False once 156511 is resolved
+    dynamic_shapes: bool = False
     recompile: bool = False
     release_memory: bool = True
     share_inputs: bool = True
@@ -269,8 +269,8 @@ def _build_compress_decompress_model(
             compressed_w = ov_results[0]
             scale, zero_point = ov_parameters[1:]
 
-        subtrac_zero_point = opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32)
-        decompressed_w = scale * opset.convert(subtrac_zero_point, ov.Type.f32)
+        compressed_w_ = opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32)
+        decompressed_w = scale * opset.convert(compressed_w_, ov.Type.f32)
     else:
         if len(ov_parameters) == 1:
             # weight -> compressed_weight, scale
diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py
index b5083a4b284..483aac9bf6b 100644
--- a/nncf/tensor/functions/ov.py
+++ b/nncf/tensor/functions/ov.py
@@ -52,7 +52,7 @@ def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor:
             share_outputs=True,
             return_ov_tensors=True,
         ),
-        a.shape,
+        tuple(a.shape),
         dtype,
     )
     return model([a])[0].data
diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py
index 872173c0990..b9720def2ac 100644
--- a/tests/openvino/native/quantization/test_openvino_modeling.py
+++ b/tests/openvino/native/quantization/test_openvino_modeling.py
@@ -20,6 +20,7 @@
 from nncf import CompressWeightsMode
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters
 from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model
 from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model
 from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight
@@ -29,6 +30,7 @@
 from nncf.results_caching import cache_results
 from nncf.tensor import Tensor
 from nncf.tensor import TensorDataType
+from nncf.tensor import functions as fns
 from nncf.tensor.definitions import TensorBackend
 from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP
 from nncf.tensor.functions.numpy_numeric import DTYPE_MAP_REV as DTYPE_MAP_REV_NP
@@ -58,11 +60,21 @@ class QuantizationTask(Enum):
 
 DATA_TYPES = 
[TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] -WEIGHT_SHAPE = (1000, 4) +WEIGHT_SHAPE = (10000, 4) + +MAX_MISALIGNMENT_FREQUENCY = { + TensorDataType.float32: 1e-2, # tends to < 5e-6 + TensorDataType.float16: 1e-2, # tends to < 5e-5 + TensorDataType.bfloat16: 1e-2, # tends to < 5e-4 +} + +MAX_MISALIGNMENT_MAGNITUDE = 1 TENSOR_BACKENDS = [TensorBackend.numpy, TensorBackend.ov] -reduction_axes = (1,) +EPS = np.finfo(np.float32).eps + +REDUCTION_AXES = (1,) RANDOM_TENSOR_CACHE_CONTAINER = ResultsCacheContainer() @@ -103,12 +115,12 @@ def openvino_available(available: bool): @pytest.mark.parametrize("config", COMPRESSION_CONFIGS, ids=[str(c) for c in COMPRESSION_CONFIGS]) -# @pytest.mark.parametrize("config", [WeightCompressionConfig(CompressWeightsMode.INT8_ASYM)]) @pytest.mark.parametrize( ("quantization_task", "tensor_backend"), [ (QuantizationTask.Q, TensorBackend.numpy), (QuantizationTask.Q, "auto"), + # Only for quantization task NumPy backend should support OV tensors as inputs (QuantizationTask.Q, TensorBackend.ov), (QuantizationTask.Q_DQ, TensorBackend.numpy), (QuantizationTask.Q_DQ, "auto"), @@ -117,8 +129,9 @@ def openvino_available(available: bool): ], ) @pytest.mark.parametrize("dtype", DATA_TYPES) -@pytest.mark.parametrize("precompute", [False, True]) -def test_quantization_alignment(config, quantization_task, tensor_backend, dtype, precompute): +@pytest.mark.parametrize("precompute_s_zp", [False, True], ids=["no-precompute", "precompute"]) +@pytest.mark.parametrize("static_shapes", [False, True], ids=["dynamic-shapes", "static-shapes"]) +def test_quantization_alignment(config, quantization_task, tensor_backend, dtype, precompute_s_zp, static_shapes): d1, d2 = WEIGHT_SHAPE group_size = config.group_size zero_point_shape = scale_shape = (d1, 1) if group_size == -1 else (d1, d2 // group_size, 1) @@ -140,10 +153,10 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # Generate input tensors weight = get_random_float_tensor(WEIGHT_SHAPE, dtype, weight_tensor_backend) precomputed_scale, precomputed_zero_point = None, None - if precompute: - # For precomputed mode, the weight is assumed to be already reshaped + if precompute_s_zp: + # When scale (and z.p) are precomputed, all inputs are assumed to be reshaped beforehand if group_size != -1: - weight, _ = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) + weight, _ = reshape_weight_for_grouped_quantization(weight, REDUCTION_AXES, group_size) precomputed_scale = get_random_float_tensor(scale_shape, TensorDataType.float32, TensorBackend.numpy) if config.is_int_asym: @@ -159,16 +172,26 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype fn_to_patch = get_compress_decompress_weight_model patch_path = f"{inspect.getmodule(fn_to_patch).__name__}.{fn_to_patch.__name__}" with patch(patch_path, side_effect=fn_to_patch) as mock: - # For precomputed mode, all inputs are assumed to be already reshaped - r_axes = None if precompute else reduction_axes - kwargs = {"return_compressed_weight": True} if quantization_task == QuantizationTask.Q_DQ_RQ else {} - outputs = fn_to_call(weight, config, r_axes, precomputed_scale, precomputed_zero_point, **kwargs) + # When scale (and z.p) are precomputed, all inputs are assumed to be already reshaped and reduction + # axes are not needed + reduction_axes = None if precompute_s_zp else REDUCTION_AXES + + kwargs = {} + if cb == ComputationBackend.OV: + ov_model_params = 
OVModelParameters(weight.dtype, dynamic_shapes=not static_shapes) + kwargs["ov_model_params"] = ov_model_params + if quantization_task == QuantizationTask.Q_DQ_RQ: + kwargs["return_compressed_weight"] = True + + outputs = fn_to_call( + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, **kwargs + ) decompressed_weight, compressed_weight, scale, zero_point = (None,) * 4 if quantization_task == QuantizationTask.Q: compressed_weight, scale, zero_point = outputs elif quantization_task == QuantizationTask.Q_DQ: - decompressed_weight = outputs[0] + decompressed_weight = outputs else: decompressed_weight, compressed_weight, scale, zero_point = outputs @@ -181,7 +204,7 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # Scale should always be float32 and numpy backend assert scale.dtype == TensorDataType.float32 assert scale.backend == TensorBackend.numpy - if precompute: + if precompute_s_zp: # In case of precomputed scale or zero point, the returned scale and z.p. should equal the given ones np.testing.assert_allclose(precomputed_scale.data, scale.data) if config.is_int_asym: @@ -198,7 +221,7 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # without re-packing assert compressed_weight.backend == TensorBackend.ov assert compressed_weight.dtype == (TensorDataType.uint4 if config.is_int_asym else TensorDataType.int4) - if config.is_int_asym and not precompute: + if config.is_int_asym and not precompute_s_zp: assert zero_point.backend == TensorBackend.ov assert zero_point.dtype == TensorDataType.uint4 else: @@ -207,7 +230,7 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # be of (u)int8 data type, zero point -- in int32 assert compressed_weight.backend == TensorBackend.numpy assert compressed_weight.dtype == (TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8) - if config.is_int_asym and not precompute: + if config.is_int_asym and not precompute_s_zp: assert zero_point.backend == TensorBackend.numpy assert zero_point.dtype == TensorDataType.int32 if quantization_task != QuantizationTask.Q: @@ -224,11 +247,60 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype results[cb]["zero_point"] = zero_point.to_backend(TensorBackend.numpy) keys = set(results[ComputationBackend.OV]).union(set(results[ComputationBackend.NumPy])) - # Check that the computed tensors are equal between implementations for key in keys: - numpy_result = results[ComputationBackend.NumPy][key].data - ov_result = results[ComputationBackend.OV][key].data - np.testing.assert_allclose(numpy_result, ov_result, err_msg=f"Results do not align for {key}.") + numpy_result = results[ComputationBackend.NumPy][key] + ov_result = results[ComputationBackend.OV][key] + + atol = 0 + scale = None + # For static-shaped OV models doing asymmetric compression there maybe misalignments between OV and NumPy + # For more details see 156511 + if static_shapes and config.is_int_asym: + if key == "compressed_weight": + atol = MAX_MISALIGNMENT_MAGNITUDE + elif key == "decompressed_weight": + if "scale" in results[ComputationBackend.NumPy]: + scale = results[ComputationBackend.NumPy]["scale"] + else: + if precompute_s_zp: + scale = precomputed_scale + else: + weight = get_random_float_tensor(WEIGHT_SHAPE, dtype, TensorBackend.numpy) + with openvino_available(False): + _, _, scale, _ = calculate_quantized_dequantized_weight( + weight, config, REDUCTION_AXES, return_compressed_weight=True + 
) + # For decompressed weight the misalignment magnitude depends on the scale + atol = MAX_MISALIGNMENT_MAGNITUDE * fns.abs(scale).max().item() + EPS + max_misalignment_frequency = MAX_MISALIGNMENT_FREQUENCY[dtype] + else: + max_misalignment_frequency = None + + # Check that the computed tensors are equal between implementations + np.testing.assert_allclose( + numpy_result.data, ov_result.data, atol=atol, err_msg=f"Results do not align for {key}." + ) + + if max_misalignment_frequency is not None: + if key == "compressed_weight": + diff = fns.abs(numpy_result.astype(TensorDataType.int32) - ov_result.astype(TensorDataType.int32)) + else: + diff = fns.abs(numpy_result - ov_result) + + if diff.max() > 0: + # Check that the proportion of misaligned values is small + n_not_equal = fns.sum(diff > 0) + assert n_not_equal / numpy_result.size < max_misalignment_frequency + + # Check that the magnitude of misalignment is as small as expected + if key == "decompressed_weight": + # Reshape scale to match the shape of decompressed weight + scale = np.repeat(scale.data, diff.shape[-1], axis=-1) + np.testing.assert_array_less( + diff.data, + MAX_MISALIGNMENT_MAGNITUDE * np.abs(scale) + EPS, + err_msg=f"Too large misalignment for {key}.", + ) # @pytest.mark.parametrize("mode", COMPRESSION_MODES) @@ -247,13 +319,13 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # # Patch is_openvino_available to control the implementation # with patch("nncf.utils.is_openvino_available", return_value=False): # # Reference implementation -# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, reduction_axes) +# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, REDUCTION_AXES) # # with patch("nncf.utils.is_openvino_available", return_value=True): # # OpenVINO implementation # ov_model_params = OVModelParameters(weight.dtype) # decompressed_weight_ov = calculate_quantized_dequantized_weight( -# weight, config, reduction_axes, ov_model_params=ov_model_params +# weight, config, REDUCTION_AXES, ov_model_params=ov_model_params # ) # # # Compare decompressed weights @@ -269,13 +341,13 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # # # Reference implementation # with patch("nncf.utils.is_openvino_available", return_value=False): -# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, reduction_axes) +# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, REDUCTION_AXES) # # # OpenVINO implementation # with patch("nncf.utils.is_openvino_available", return_value=True): # ov_model_params = OVModelParameters(weight.dtype) # decompressed_weight_ov = calculate_quantized_dequantized_weight( -# weight, config, reduction_axes, ov_model_params=ov_model_params +# weight, config, REDUCTION_AXES, ov_model_params=ov_model_params # ) # # # Compare decompressed weights @@ -291,13 +363,13 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # # # Reference implementation with numpy backend # with patch("nncf.utils.is_openvino_available", return_value=False): -# compressed_weight_ref, scale_ref = do_int_quantization(weight_numpy, config, reduction_axes) +# compressed_weight_ref, scale_ref = do_int_quantization(weight_numpy, config, REDUCTION_AXES) # # # OpenVINO implementation with OV backend # with patch("nncf.utils.is_openvino_available", return_value=True): # ov_model_params = OVModelParameters(weight_ov.dtype) # 
compressed_weight_ov, scale_ov = do_int_quantization( -# weight_ov, config, reduction_axes, ov_model_params=ov_model_params +# weight_ov, config, REDUCTION_AXES, ov_model_params=ov_model_params # ) # # # Compare compressed weights From 6289c5cb6bd172a3b47b487be9a7be1840e619c6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 7 Nov 2024 14:56:07 +0100 Subject: [PATCH 25/73] Merge-related fixes --- .../native/quantization/test_openvino_modeling.py | 2 +- .../native/quantization/test_weights_compression.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py index b9720def2ac..6d82885dd0f 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -120,7 +120,7 @@ def openvino_available(available: bool): [ (QuantizationTask.Q, TensorBackend.numpy), (QuantizationTask.Q, "auto"), - # Only for quantization task NumPy backend should support OV tensors as inputs + # NumPy backend should support OV tensors as inputs only for quantization task (QuantizationTask.Q, TensorBackend.ov), (QuantizationTask.Q_DQ, TensorBackend.numpy), (QuantizationTask.Q_DQ, "auto"), diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 239a94eaf63..f187b34961f 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1062,8 +1062,6 @@ def test_compressed_weighs_range(mode, data): ], ) def test_int_quantization_with_precomputed_parameters(config, precompute_scale, precompute_zero_point, raises): - is_asym = config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT8_ASYM] - precomputed_scale, precomputed_zero_point = None, None weight = Tensor(((np.arange(11) - 5) / 10).astype(np.float32)[:, None]) if precompute_scale: @@ -1073,18 +1071,18 @@ def test_int_quantization_with_precomputed_parameters(config, precompute_scale, if raises: with pytest.raises(ValueError) as exc_info: - _, scale, zero_point = do_int_quantization(weight, -1, config, precomputed_scale, precomputed_zero_point) + _, scale, zero_point = do_int_quantization(weight, config, -1, precomputed_scale, precomputed_zero_point) assert exc_info.value == ( "If precomputed quantization parameters are provided, both scale and zero point " "are required for asymmetric quantization." 
) return else: - _, scale, zero_point = do_int_quantization(weight, -1, config, precomputed_scale, precomputed_zero_point) + _, scale, zero_point = do_int_quantization(weight, config, -1, precomputed_scale, precomputed_zero_point) if precompute_scale: assert np.allclose(scale.data, precomputed_scale.data) - if is_asym: + if config.is_int_asym: if precompute_zero_point: assert np.allclose(zero_point.data, precomputed_zero_point.data) else: From f60fd177e56ccce6f67fae1db2b1708bdce1ca52 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 7 Nov 2024 16:33:18 +0100 Subject: [PATCH 26/73] Tweaks --- .../weight_compression/openvino_modeling.py | 16 ++-- nncf/tensor/functions/ov.py | 3 +- .../quantization/test_openvino_modeling.py | 89 ++----------------- 3 files changed, 16 insertions(+), 92 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index d98fbde9324..a2092604b1a 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -23,7 +23,7 @@ from nncf.results_caching import cache_results from nncf.tensor import Tensor from nncf.tensor import TensorDataType -from nncf.tensor.functions.ov import DTYPE_MAP as OV_DTYPE_MAP +from nncf.tensor.functions.ov import DTYPE_MAP as DTYPE_MAP_OV TensorList = List[Tensor] ModelCallable = Callable[[TensorList], TensorList] @@ -61,9 +61,7 @@ def __hash__(self): def run_model( ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, return_ov_tensors: bool, inputs: TensorList ) -> TensorList: - if any(isinstance(it, Tensor) for it in inputs): - inputs = [inp.data for inp in inputs] - + inputs = [inp.data for inp in inputs] if return_ov_tensors: infer_request = compiled_model.create_infer_request() infer_request.infer( @@ -151,7 +149,7 @@ def _build_compress_model( reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, ) -> Union[ModelCallable, Tuple[List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]: - weight = opset.parameter(weight_shape, name="w", dtype=OV_DTYPE_MAP[ov_model_params.input_dtype]) + weight = opset.parameter(weight_shape, name="w", dtype=DTYPE_MAP_OV[ov_model_params.input_dtype]) ov_parameters = [weight] num_bits = config.num_bits @@ -214,13 +212,13 @@ def _build_compress_model( if config.is_int_asym: if ov_model_params.output_dtype is not None: - dtype = OV_DTYPE_MAP[ov_model_params.output_dtype] + dtype = DTYPE_MAP_OV[ov_model_params.output_dtype] else: dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 compressed_w += zero_point else: if ov_model_params.output_dtype is not None: - dtype = OV_DTYPE_MAP[ov_model_params.output_dtype] + dtype = DTYPE_MAP_OV[ov_model_params.output_dtype] else: dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4 @@ -296,8 +294,8 @@ def get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype @cache_results(OV_MODEL_CACHE) def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable: - arg = opset.parameter(arg_shape, dtype=OV_DTYPE_MAP[ov_model_params.input_dtype]) - res = opset.convert(arg, OV_DTYPE_MAP[dtype]) + arg = opset.parameter(arg_shape, dtype=DTYPE_MAP_OV[ov_model_params.input_dtype]) + res = opset.convert(arg, DTYPE_MAP_OV[dtype]) model = ov.Model([res], [arg]) compiled_model = ov.compile_model(model, device_name="CPU") 
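For context on how the helpers touched above fit together at this point in the series, a minimal usage sketch of the weight-compression model builder (illustrative only, not part of the patch; the INT8_ASYM config, the shapes, and the random input are arbitrary choices, and the call mirrors what do_int_quantization does internally with the patch-26 state of OVModelParameters):

    import numpy as np

    from nncf import CompressWeightsMode
    from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
    from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters
    from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model
    from nncf.tensor import Tensor
    from nncf.tensor import TensorDataType

    # Build (and cache) a CPU-compiled model that maps a weight to compressed weight, scale and zero point.
    ov_model_params = OVModelParameters(input_dtype=TensorDataType.float32)
    model = get_compress_weight_model(
        ov_model_params,
        WeightCompressionConfig(CompressWeightsMode.INT8_ASYM),
        weight_shape=(10, 4),
        reduction_axes=(1,),
    )

    # The returned callable wraps an ov.CompiledModel; inputs and outputs are nncf Tensors.
    weight = Tensor(np.random.rand(10, 4).astype(np.float32))
    compressed_weight, scale, zero_point = model([weight])

Calling get_compress_weight_model again with the same OVModelParameters, config and shapes reuses the compiled model from OV_MODEL_CACHE instead of recompiling it.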
diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index 483aac9bf6b..55b0b854499 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -13,6 +13,7 @@ import numpy as np import openvino as ov +from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor.functions import numeric @@ -55,7 +56,7 @@ def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: tuple(a.shape), dtype, ) - return model([a])[0].data + return model([Tensor(a)])[0].data @numeric.backend.register(ov.Tensor) diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py index 6d82885dd0f..e1e45ef0391 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -60,8 +60,6 @@ class QuantizationTask(Enum): DATA_TYPES = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] -WEIGHT_SHAPE = (10000, 4) - MAX_MISALIGNMENT_FREQUENCY = { TensorDataType.float32: 1e-2, # tends to < 5e-6 TensorDataType.float16: 1e-2, # tends to < 5e-5 @@ -114,6 +112,7 @@ def openvino_available(available: bool): nncf.utils._openvino_available = original_value +@pytest.mark.parametrize("weight_shape", [(10000, 4)], ids=[""]) @pytest.mark.parametrize("config", COMPRESSION_CONFIGS, ids=[str(c) for c in COMPRESSION_CONFIGS]) @pytest.mark.parametrize( ("quantization_task", "tensor_backend"), @@ -131,8 +130,10 @@ def openvino_available(available: bool): @pytest.mark.parametrize("dtype", DATA_TYPES) @pytest.mark.parametrize("precompute_s_zp", [False, True], ids=["no-precompute", "precompute"]) @pytest.mark.parametrize("static_shapes", [False, True], ids=["dynamic-shapes", "static-shapes"]) -def test_quantization_alignment(config, quantization_task, tensor_backend, dtype, precompute_s_zp, static_shapes): - d1, d2 = WEIGHT_SHAPE +def test_quantization_alignment( + weight_shape, config, quantization_task, tensor_backend, dtype, precompute_s_zp, static_shapes +): + d1, d2 = weight_shape group_size = config.group_size zero_point_shape = scale_shape = (d1, 1) if group_size == -1 else (d1, d2 // group_size, 1) level_low, level_high = 0, 2**config.num_bits - 1 @@ -151,7 +152,7 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype weight_tensor_backend = TensorBackend.numpy # Generate input tensors - weight = get_random_float_tensor(WEIGHT_SHAPE, dtype, weight_tensor_backend) + weight = get_random_float_tensor(weight_shape, dtype, weight_tensor_backend) precomputed_scale, precomputed_zero_point = None, None if precompute_s_zp: # When scale (and z.p) are precomputed, all inputs are assumed to be reshaped beforehand @@ -265,7 +266,7 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype if precompute_s_zp: scale = precomputed_scale else: - weight = get_random_float_tensor(WEIGHT_SHAPE, dtype, TensorBackend.numpy) + weight = get_random_float_tensor(weight_shape, dtype, TensorBackend.numpy) with openvino_available(False): _, _, scale, _ = calculate_quantized_dequantized_weight( weight, config, REDUCTION_AXES, return_compressed_weight=True @@ -301,79 +302,3 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype MAX_MISALIGNMENT_MAGNITUDE * np.abs(scale) + EPS, err_msg=f"Too large misalignment for {key}.", ) - - -# @pytest.mark.parametrize("mode", COMPRESSION_MODES) -# @pytest.mark.parametrize("group_size", [2]) -# def 
test_grouped_quantization(mode, group_size): -# if mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM]: -# pytest.skip("Group size is not applicable for INT8 modes") -# -# # Generate random weight tensor -# weight_shape = (128, 4) -# weight = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.numpy) -# -# # Create WeightCompressionConfig -# config = WeightCompressionConfig(mode, group_size=group_size) -# -# # Patch is_openvino_available to control the implementation -# with patch("nncf.utils.is_openvino_available", return_value=False): -# # Reference implementation -# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, REDUCTION_AXES) -# -# with patch("nncf.utils.is_openvino_available", return_value=True): -# # OpenVINO implementation -# ov_model_params = OVModelParameters(weight.dtype) -# decompressed_weight_ov = calculate_quantized_dequantized_weight( -# weight, config, REDUCTION_AXES, ov_model_params=ov_model_params -# ) -# -# # Compare decompressed weights -# np.testing.assert_allclose(decompressed_weight_ref.data, decompressed_weight_ov.data, atol=1e-5, rtol=1e-4) -# -# -# def test_weight_dtypes(): -# # Test different weight data types -# weight_shape = (128, 4) -# for dtype in DATA_TYPES: -# weight = get_random_float_tensor(weight_shape, dtype, TensorBackend.numpy) -# config = WeightCompressionConfig(CompressWeightsMode.INT8_SYM) -# -# # Reference implementation -# with patch("nncf.utils.is_openvino_available", return_value=False): -# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, REDUCTION_AXES) -# -# # OpenVINO implementation -# with patch("nncf.utils.is_openvino_available", return_value=True): -# ov_model_params = OVModelParameters(weight.dtype) -# decompressed_weight_ov = calculate_quantized_dequantized_weight( -# weight, config, REDUCTION_AXES, ov_model_params=ov_model_params -# ) -# -# # Compare decompressed weights -# np.testing.assert_allclose(decompressed_weight_ref.data, decompressed_weight_ov.data, atol=1e-5, rtol=1e-4) -# -# -# def test_tensor_backends(): -# # Test different tensor backends for do_int_quantization -# weight_shape = (128, 4) -# weight_numpy = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.numpy) -# weight_ov = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.ov) -# config = WeightCompressionConfig(CompressWeightsMode.INT8_SYM) -# -# # Reference implementation with numpy backend -# with patch("nncf.utils.is_openvino_available", return_value=False): -# compressed_weight_ref, scale_ref = do_int_quantization(weight_numpy, config, REDUCTION_AXES) -# -# # OpenVINO implementation with OV backend -# with patch("nncf.utils.is_openvino_available", return_value=True): -# ov_model_params = OVModelParameters(weight_ov.dtype) -# compressed_weight_ov, scale_ov = do_int_quantization( -# weight_ov, config, REDUCTION_AXES, ov_model_params=ov_model_params -# ) -# -# # Compare compressed weights -# np.testing.assert_allclose(compressed_weight_ref.data, compressed_weight_ov.data, atol=1e-5, rtol=1e-4) -# -# # Compare scales -# np.testing.assert_allclose(scale_ref.data, scale_ov.data, atol=1e-5, rtol=1e-4) From 57a0931fb0e60675e2e9970d145b96b6e86a72e0 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 11 Nov 2024 16:32:39 +0100 Subject: [PATCH 27/73] Strict input/output data types --- .../weight_compression/openvino_modeling.py | 159 +++++++++++++----- .../weight_compression/weight_lowering.py | 43 +++-- 
 nncf/tensor/functions/ov.py                   |   4 +-
 .../quantization/test_openvino_modeling.py    |   4 +-
 4 files changed, 149 insertions(+), 61 deletions(-)

diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
index a2092604b1a..cec16ce8bb7 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
@@ -11,13 +11,12 @@
 
 from dataclasses import dataclass
 from functools import partial
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import openvino as ov
 from openvino.runtime import opset13 as opset
 
-from nncf import CompressWeightsMode
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
 from nncf.results_caching import ResultsCacheContainer
 from nncf.results_caching import cache_results
@@ -34,8 +33,8 @@
 
 @dataclass
 class OVModelParameters:
-    input_dtype: TensorDataType
-    output_dtype: Optional[TensorDataType] = None
+    input_dtypes: Optional[Dict[str, TensorDataType]] = None
+    output_dtypes: Optional[Dict[str, TensorDataType]] = None
     dynamic_shapes: bool = False
     recompile: bool = False
     release_memory: bool = True
@@ -46,8 +45,8 @@ class OVModelParameters:
     def __hash__(self):
         return hash(
             (
-                self.input_dtype,
-                self.output_dtype,
+                None if self.input_dtypes is None else frozenset(self.input_dtypes.items()),
+                None if self.output_dtypes is None else frozenset(self.output_dtypes.items()),
                 self.dynamic_shapes,
                 self.recompile,
                 self.release_memory,
@@ -61,6 +60,15 @@ def __hash__(self):
 def run_model(
     ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, return_ov_tensors: bool, inputs: TensorList
 ) -> TensorList:
+    # Check that input dtypes match the expected dtypes
+    for i, inp in enumerate(compiled_model.inputs):
+        input_name = inp.any_name
+        actual_dtype = inputs[i].dtype
+        expected_dtype = ov_model_params.input_dtypes[input_name]
+        if actual_dtype != expected_dtype:
+            raise ValueError(f"Expected input '{input_name}' to be {expected_dtype}. But found: {actual_dtype}.")
+
+    # Infer the model
     inputs = [inp.data for inp in inputs]
     if return_ov_tensors:
         infer_request = compiled_model.create_infer_request()
         infer_request.infer(
@@ -149,12 +157,60 @@ def _build_compress_model(
     reduction_axes: Optional[Tuple] = None,
     return_nodes: bool = False,
 ) -> Union[ModelCallable, Tuple[List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]:
-    weight = opset.parameter(weight_shape, name="w", dtype=DTYPE_MAP_OV[ov_model_params.input_dtype])
+    input_dtypes = ov_model_params.input_dtypes
+    if input_dtypes is None:
+        raise ValueError("Input dtypes must be provided.")
+    output_dtypes = ov_model_params.output_dtypes
+    if output_dtypes is None:
+        raise ValueError("Output dtypes must be provided.")
+
+    weight_dtype = input_dtypes.get("weight")
+    input_scale_dtype = input_dtypes.get("scale", None)
+    input_zero_point_dtype = input_dtypes.get("zero_point", None)
+    compressed_weight_dtype = output_dtypes.get("compressed_weight")
+    output_scale_dtype = output_dtypes.get("scale", None)
+    output_zero_point_dtype = output_dtypes.get("zero_point", None)
+
+    # Validate input dtypes
+    valid_weight_dtypes = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]
+    if weight_dtype not in valid_weight_dtypes:
+        raise ValueError(
+            f"Weight must be one of the following data types: {valid_weight_dtypes}. 
But found: {weight_dtype}." + ) + if scale_shape is not None and input_scale_dtype != TensorDataType.float32: + raise ValueError(f"Input scale must be of float32 data type. But found: {input_scale_dtype}.") + if zero_point_shape is not None and input_zero_point_dtype != TensorDataType.int32: + raise ValueError(f"Input zero point must be of int32 data type. But found: {input_zero_point_dtype}.") + + # Validate output dtypes + valid_compressed_weight_dtypes = [ + TensorDataType.int32, + TensorDataType.int8, + TensorDataType.uint8, + TensorDataType.int4, + TensorDataType.uint4, + ] + if compressed_weight_dtype not in valid_compressed_weight_dtypes: + raise ValueError( + f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. " + f"But found: {compressed_weight_dtype}." + ) + if scale_shape is None and output_scale_dtype != TensorDataType.float32: + raise ValueError(f"Output scale must be of float32 data type. But found: {output_scale_dtype}.") + is_int_asym = config.is_int_asym + if is_int_asym and zero_point_shape is None and output_zero_point_dtype not in valid_compressed_weight_dtypes: + raise ValueError( + f"Output zero point must be of one of the following data types: {valid_compressed_weight_dtypes}. " + f"But found: {output_zero_point_dtype}." + ) + + # Build OV model + weight = opset.parameter(weight_shape, name="weight", dtype=DTYPE_MAP_OV[weight_dtype]) ov_parameters = [weight] num_bits = config.num_bits eps = np.finfo(np.float32).eps - if config.is_int_asym: + if is_int_asym: level_low = 0 level_high = 2**num_bits - 1 else: @@ -164,11 +220,11 @@ def _build_compress_model( min_values = None if scale_shape is not None: # Scale is given as an input - scale = opset.parameter(scale_shape, name="s", dtype=ov.Type.f32) + scale = opset.parameter(scale_shape, name="scale", dtype=ov.Type.f32) ov_parameters.append(scale) else: # Compute scale - if config.is_int_asym: + if is_int_asym: min_values = opset.reduce_min( weight, reduction_axes=reduction_axes, keep_dims=True ) # [a1, r, a2] -> [a1, 1, a2] @@ -192,47 +248,36 @@ def _build_compress_model( zero_point = None if zero_point_shape is not None: # Zero point is given as an input - zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) + zero_point = opset.parameter(zero_point_shape, name="zero_point", dtype=ov.Type.i32) ov_parameters.append(zero_point) + # Cast to float32 for an addition later zero_point = opset.convert(zero_point, ov.Type.f32) - elif config.is_int_asym: + elif is_int_asym: # Compute zero point if min_values is None: min_values = opset.reduce_min( weight, reduction_axes=reduction_axes, keep_dims=True ) # [a1, r, a2] -> [a1, 1, a2] min_values = opset.convert(min_values, ov.Type.f32) - zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) zero_point = opset.clamp(zero_point, level_low, level_high) if weight.get_element_type() != ov.Type.f32: weight = opset.convert(weight, ov.Type.f32) - compressed_w = weight / scale + compressed_weight = weight / scale - if config.is_int_asym: - if ov_model_params.output_dtype is not None: - dtype = DTYPE_MAP_OV[ov_model_params.output_dtype] - else: - dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 - compressed_w += zero_point - else: - if ov_model_params.output_dtype is not None: - dtype = DTYPE_MAP_OV[ov_model_params.output_dtype] - else: - dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4 + if is_int_asym: + compressed_weight += 
zero_point - compressed_w = opset.round(compressed_w) - compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high) - compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights") + compressed_weight = opset.round(compressed_weight) + compressed_weight = opset.clamp(opset.round(compressed_weight), level_low, level_high) + compressed_weight = opset.convert(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) - ov_results = [compressed_w] + ov_results = [compressed_weight] if len(ov_parameters) == 1: ov_results.append(scale) if zero_point is not None: - zero_point_dtype = compressed_w.get_element_type() if ov_model_params.return_ov_tensors else ov.Type.i32 - if zero_point.get_element_type() != zero_point_dtype: - zero_point = opset.convert(zero_point, zero_point_dtype) + zero_point = opset.convert(zero_point, DTYPE_MAP_OV[output_zero_point_dtype]) ov_results.append(zero_point) if return_nodes: @@ -254,6 +299,17 @@ def _build_compress_decompress_model( reduction_axes: Optional[Tuple] = None, return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: + input_dtypes = ov_model_params.input_dtypes + if input_dtypes is None: + raise ValueError("Input dtypes must be provided.") + output_dtypes = ov_model_params.output_dtypes + if output_dtypes is None: + raise ValueError("Output dtypes must be provided.") + + decompressed_weight_dtype = output_dtypes.get("decompressed_weight") + if decompressed_weight_dtype != TensorDataType.float32: + raise ValueError(f"Decompressed weight must be of float32 data type. But found: {decompressed_weight_dtype}.") + ov_parameters, ov_results = _build_compress_model( config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True ) @@ -261,41 +317,54 @@ def _build_compress_decompress_model( if config.is_int_asym: if len(ov_parameters) == 1: # weight -> compressed_weight, scale, zero_point - compressed_w, scale, zero_point = ov_results + compressed_weight, scale, zero_point = ov_results else: # weight, scale, zero_point -> compressed_weight - compressed_w = ov_results[0] + compressed_weight = ov_results[0] scale, zero_point = ov_parameters[1:] - compressed_w_ = opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32) - decompressed_w = scale * opset.convert(compressed_w_, ov.Type.f32) + compressed_weight = opset.convert(compressed_weight, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32) else: if len(ov_parameters) == 1: # weight -> compressed_weight, scale - compressed_w, scale = ov_results + compressed_weight, scale = ov_results else: # weight, scale -> compressed_weight - compressed_w = ov_results[0] + compressed_weight = ov_results[0] scale = ov_parameters[1] - decompressed_w = opset.convert(compressed_w, ov.Type.f32) * scale - ov_results = [decompressed_w] + ov_results if return_compressed_weight else [decompressed_w] + if compressed_weight.get_element_type() != ov.Type.f32: + compressed_weight = opset.convert(compressed_weight, ov.Type.f32) + decompressed_weight = opset.multiply(scale, compressed_weight) + + ov_results = [decompressed_weight] + ov_results if return_compressed_weight else [decompressed_weight] model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) -def get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable: +def 
get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> ModelCallable: if ov_model_params.dynamic_shapes: arg_shape = (-1,) * len(arg_shape) - return _build_astype_model(ov_model_params, arg_shape, dtype) + return _build_astype_model(ov_model_params, arg_shape) @cache_results(OV_MODEL_CACHE) -def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable: - arg = opset.parameter(arg_shape, dtype=DTYPE_MAP_OV[ov_model_params.input_dtype]) - res = opset.convert(arg, DTYPE_MAP_OV[dtype]) +def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> ModelCallable: + input_dtypes = ov_model_params.input_dtypes + if input_dtypes is None: + raise ValueError("Input dtypes must be provided.") + output_dtypes = ov_model_params.output_dtypes + if output_dtypes is None: + raise ValueError("Output dtypes must be provided.") + if "input" not in input_dtypes: + raise ValueError("Input dtype is required.") + if "output" not in output_dtypes: + raise ValueError("Output dtype is required.") + + arg = opset.parameter(arg_shape, dtype=DTYPE_MAP_OV[input_dtypes["input"]], name="input") + res = opset.convert(arg, DTYPE_MAP_OV[output_dtypes["output"]]) model = ov.Model([res], [arg]) compiled_model = ov.compile_model(model, device_name="CPU") diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 20c4b3e539a..4e1cabc3790 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -8,6 +8,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import copy import logging from dataclasses import dataclass from typing import Optional, Tuple, Union @@ -490,13 +491,25 @@ def do_int_quantization( scale_shape = None if precomputed_scale is None else precomputed_scale.shape zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape - if ov_model_params is None: - ov_model_params = OVModelParameters(weight.dtype) - if config.num_bits == 4: - if weight.backend == TensorBackend.ov: - ov_model_params.return_ov_tensors = weight.backend == TensorBackend.ov - else: - ov_model_params.output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8 + ov_model_params = OVModelParameters() if ov_model_params is None else copy.deepcopy(ov_model_params) + ov_model_params.input_dtypes = ov_model_params.input_dtypes or { + "weight": weight.dtype, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + } + ov_model_params.output_dtypes = ov_model_params.output_dtypes or { + "compressed_weight": TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + } + if config.num_bits == 4 and weight.backend == TensorBackend.ov: + # Return ov tensors in target precision to seamlessly insert them into openvino model later + ov_model_params.return_ov_tensors = weight.backend == TensorBackend.ov + compressed_weight_dtype = TensorDataType.uint4 if config.is_int_asym else TensorDataType.int4 + ov_model_params.output_dtypes.update( + {"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype} + ) + # ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) # ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) # ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) @@ -582,10 +595,18 @@ def calculate_quantized_dequantized_weight( scale_shape = precomputed_scale.shape if precomputed_scale is not None else None zero_point_shape = precomputed_zero_point.shape if precomputed_zero_point is not None else None - if ov_model_params is None: - ov_model_params = OVModelParameters(weight.dtype) - if return_compressed_weight and config.num_bits == 4: - ov_model_params.output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8 + ov_model_params = OVModelParameters() if ov_model_params is None else copy.deepcopy(ov_model_params) + ov_model_params.input_dtypes = ov_model_params.input_dtypes or { + "weight": weight.dtype, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + } + ov_model_params.output_dtypes = ov_model_params.output_dtypes or { + "decompressed_weight": TensorDataType.float32, + "compressed_weight": TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + } model = get_compress_decompress_weight_model( ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_compressed_weight diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index 55b0b854499..a868d310190 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -45,7 +45,8 @@ def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: model = get_astype_model( OVModelParameters( - input_dtype=a_dtype, + input_dtypes={"input": a_dtype}, + output_dtypes={"output": dtype}, dynamic_shapes=True, recompile=False, release_memory=True, @@ -54,7 +55,6 @@ def 
_ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: return_ov_tensors=True, ), tuple(a.shape), - dtype, ) return model([Tensor(a)])[0].data diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py index e1e45ef0391..12de6121ac6 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -57,7 +57,6 @@ class QuantizationTask(Enum): WeightCompressionConfig(CompressWeightsMode.INT4_SYM, group_size=2), ] - DATA_TYPES = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] MAX_MISALIGNMENT_FREQUENCY = { @@ -74,7 +73,6 @@ class QuantizationTask(Enum): REDUCTION_AXES = (1,) - RANDOM_TENSOR_CACHE_CONTAINER = ResultsCacheContainer() @@ -179,7 +177,7 @@ def test_quantization_alignment( kwargs = {} if cb == ComputationBackend.OV: - ov_model_params = OVModelParameters(weight.dtype, dynamic_shapes=not static_shapes) + ov_model_params = OVModelParameters(dynamic_shapes=not static_shapes) kwargs["ov_model_params"] = ov_model_params if quantization_task == QuantizationTask.Q_DQ_RQ: kwargs["return_compressed_weight"] = True From 1010fcf56bc62140440383d03df2a68853d041f5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 11 Nov 2024 17:58:55 +0100 Subject: [PATCH 28/73] Add dynamic shapes test --- .../quantization/test_openvino_modeling.py | 89 ++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py index 12de6121ac6..9321087939e 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -20,7 +20,8 @@ from nncf import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters, OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight @@ -300,3 +301,89 @@ def test_quantization_alignment( MAX_MISALIGNMENT_MAGNITUDE * np.abs(scale) + EPS, err_msg=f"Too large misalignment for {key}.", ) + + +@pytest.mark.parametrize("get_ov_model_fn,input_shapes,ref_cache_size", [ + ( + lambda dynamic_shapes, input_shapes: get_compress_weight_model( + OVModelParameters( + input_dtypes={ + "weight": TensorDataType.float32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32 + }, + output_dtypes={ + "compressed_weight": TensorDataType.uint8 + }, + dynamic_shapes=dynamic_shapes, + ), + WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + *input_shapes, + reduction_axes=REDUCTION_AXES, + ), + [ + [(10, 4), (10, 1), (10, 1)], + [(20, 6), (20, 1), (20, 1)], + [(20, 8), (20, 1), (20, 1)], + [(10, 4, 4), (10, 4, 1), (10, 4, 1),], + [(10, 8, 4), (10, 8, 1), (10, 8, 1),], + ], + {False: 5, True: 2} + ), + ( + lambda dynamic_shapes, input_shapes: 
get_compress_decompress_weight_model( + OVModelParameters( + input_dtypes={ + "weight": TensorDataType.float32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32 + }, + output_dtypes={ + "compressed_weight": TensorDataType.int32, + "decompressed_weight": TensorDataType.float32, + }, + dynamic_shapes=dynamic_shapes, + ), + WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + *input_shapes, + reduction_axes=REDUCTION_AXES, + ), + [ + [(10, 4), (10, 1), (10, 1)], + [(20, 6), (20, 1), (20, 1)], + [(20, 8), (20, 1), (20, 1)], + [(10, 4, 4), (10, 4, 1), (10, 4, 1),], + [(10, 8, 4), (10, 8, 1), (10, 8, 1),], + ], + {False: 10, True: 4} + ), + ( + lambda dynamic_shapes, input_shape: get_astype_model( + OVModelParameters( + input_dtypes={ + "input": TensorDataType.float32, + }, + output_dtypes={ + "output": TensorDataType.int32, + }, + dynamic_shapes=dynamic_shapes, + ), + input_shape, + ), + [ + (10, 4), + (20, 6), + (20, 8), + (10, 4, 4), + (10, 8, 4), + ], + {False: 5, True: 2} + ), +]) +@pytest.mark.parametrize("dynamic_shapes", [False, True]) +def test_dynamic_shapes(get_ov_model_fn, input_shapes, ref_cache_size, dynamic_shapes): + # Check that model cache contains fewer elements with dynamic shapes included + OV_MODEL_CACHE.clear() + for shape in input_shapes: + get_ov_model_fn(dynamic_shapes, shape) + assert len(OV_MODEL_CACHE._cache) == ref_cache_size[dynamic_shapes] From 6e54fba431ee2989e218d0458eb50953c2e5e47b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 13 Nov 2024 14:44:52 +0100 Subject: [PATCH 29/73] ov modeling tests --- .../weight_compression/openvino_modeling.py | 31 +- .../quantization/test_openvino_modeling.py | 267 +++++++++++++----- 2 files changed, 212 insertions(+), 86 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index cec16ce8bb7..1cab401ee01 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -9,6 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy from dataclasses import dataclass from functools import partial from typing import Callable, Dict, List, Optional, Tuple, Union @@ -96,6 +97,7 @@ def get_compress_weight_model( scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, + return_nodes: Optional[bool] = False, ) -> ModelCallable: if scale_shape is None and zero_point_shape is not None: raise Exception("Zero point shape can only be provided if scale shape is provided.") @@ -114,7 +116,7 @@ def get_compress_weight_model( scale_shape, zero_point_shape, reduction_axes, - return_nodes=False, + return_nodes=return_nodes, disable_caching=ov_model_params.recompile, ) @@ -165,11 +167,11 @@ def _build_compress_model( raise ValueError("Output dtypes must be provided.") weight_dtype = input_dtypes.get("weight") - input_scale_dtype = input_dtypes.get("scale", None) - input_zero_point_dtype = input_dtypes.get("zero_point", None) + input_scale_dtype = input_dtypes.get("scale", TensorDataType.float32) + input_zero_point_dtype = input_dtypes.get("zero_point", TensorDataType.int32) compressed_weight_dtype = output_dtypes.get("compressed_weight") - output_scale_dtype = output_dtypes.get("scale", None) - output_zero_point_dtype = output_dtypes.get("zero_point", None) + output_scale_dtype = output_dtypes.get("scale", TensorDataType.float32) + output_zero_point_dtype = output_dtypes.get("zero_point", TensorDataType.int32) # Validate input dtypes valid_weight_dtypes = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] @@ -190,7 +192,7 @@ def _build_compress_model( TensorDataType.int4, TensorDataType.uint4, ] - if compressed_weight_dtype not in valid_compressed_weight_dtypes: + if compressed_weight_dtype not in valid_compressed_weight_dtypes + [TensorDataType.float32]: raise ValueError( f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. " f"But found: {compressed_weight_dtype}." @@ -271,7 +273,8 @@ def _build_compress_model( compressed_weight = opset.round(compressed_weight) compressed_weight = opset.clamp(opset.round(compressed_weight), level_low, level_high) - compressed_weight = opset.convert(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) + if compressed_weight_dtype != TensorDataType.float32: + compressed_weight = opset.convert(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) ov_results = [compressed_weight] if len(ov_parameters) == 1: @@ -310,8 +313,12 @@ def _build_compress_decompress_model( if decompressed_weight_dtype != TensorDataType.float32: raise ValueError(f"Decompressed weight must be of float32 data type. 
But found: {decompressed_weight_dtype}.") - ov_parameters, ov_results = _build_compress_model( - config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True + if "compressed_weight" not in output_dtypes: + ov_model_params = copy.deepcopy(ov_model_params) + ov_model_params.output_dtypes["compressed_weight"] = TensorDataType.float32 + + ov_parameters, ov_results = get_compress_weight_model( + ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True ) if config.is_int_asym: @@ -344,10 +351,10 @@ def _build_compress_decompress_model( return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) -def get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> ModelCallable: +def get_astype_model(ov_model_params: OVModelParameters, input_shape: Tuple) -> ModelCallable: if ov_model_params.dynamic_shapes: - arg_shape = (-1,) * len(arg_shape) - return _build_astype_model(ov_model_params, arg_shape) + input_shape = (-1,) * len(input_shape) + return _build_astype_model(ov_model_params, input_shape, disable_caching=ov_model_params.recompile) @cache_results(OV_MODEL_CACHE) diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py index 9321087939e..d7d562cff6c 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -8,6 +8,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import inspect from collections import defaultdict from contextlib import contextmanager @@ -20,7 +21,8 @@ from nncf import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters, OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model @@ -303,87 +305,204 @@ def test_quantization_alignment( ) -@pytest.mark.parametrize("get_ov_model_fn,input_shapes,ref_cache_size", [ - ( - lambda dynamic_shapes, input_shapes: get_compress_weight_model( - OVModelParameters( - input_dtypes={ - "weight": TensorDataType.float32, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32 - }, - output_dtypes={ - "compressed_weight": TensorDataType.uint8 - }, - dynamic_shapes=dynamic_shapes, - ), - WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), - *input_shapes, +class ModelGetter: + def __init__(self, get_model_fn, ov_model_params_kwargs, get_model_kwargs): + self._get_model_fn = get_model_fn + self._ov_model_params_kwargs = ov_model_params_kwargs + self._get_model_kwargs = get_model_kwargs + + def get(self, ov_model_params_kwargs=None, get_model_kwargs=None): + ov_model_params_kwargs = ov_model_params_kwargs or {} + get_model_kwargs = get_model_kwargs or {} + return self._get_model_fn( + 
OVModelParameters(**{**self._ov_model_params_kwargs, **ov_model_params_kwargs}), + **{**self._get_model_kwargs, **get_model_kwargs}, + ) + + +MODEL_GETTERS = [ + ModelGetter( + get_model_fn=get_compress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "weight": TensorDataType.float32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + output_dtypes={"compressed_weight": TensorDataType.uint8}, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), + scale_shape=(10, 1), + zero_point_shape=(10, 1), + ), + ), + ModelGetter( + get_model_fn=get_compress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={"weight": TensorDataType.float32}, + output_dtypes={ + "compressed_weight": TensorDataType.uint8, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), reduction_axes=REDUCTION_AXES, ), - [ - [(10, 4), (10, 1), (10, 1)], - [(20, 6), (20, 1), (20, 1)], - [(20, 8), (20, 1), (20, 1)], - [(10, 4, 4), (10, 4, 1), (10, 4, 1),], - [(10, 8, 4), (10, 8, 1), (10, 8, 1),], - ], - {False: 5, True: 2} ), - ( - lambda dynamic_shapes, input_shapes: get_compress_decompress_weight_model( - OVModelParameters( - input_dtypes={ - "weight": TensorDataType.float32, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32 - }, - output_dtypes={ - "compressed_weight": TensorDataType.int32, - "decompressed_weight": TensorDataType.float32, - }, - dynamic_shapes=dynamic_shapes, - ), - WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), - *input_shapes, + ModelGetter( + get_model_fn=get_compress_decompress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "weight": TensorDataType.float32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + output_dtypes={ + "decompressed_weight": TensorDataType.float32, + }, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), + scale_shape=(10, 1), + zero_point_shape=(10, 1), + ), + ), + ModelGetter( + get_model_fn=get_compress_decompress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "weight": TensorDataType.float32, + }, + output_dtypes={ + "decompressed_weight": TensorDataType.float32, + "compressed_weight": TensorDataType.int32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), reduction_axes=REDUCTION_AXES, + return_compressed_weight=True, ), - [ - [(10, 4), (10, 1), (10, 1)], - [(20, 6), (20, 1), (20, 1)], - [(20, 8), (20, 1), (20, 1)], - [(10, 4, 4), (10, 4, 1), (10, 4, 1),], - [(10, 8, 4), (10, 8, 1), (10, 8, 1),], - ], - {False: 10, True: 4} ), - ( - lambda dynamic_shapes, input_shape: get_astype_model( - OVModelParameters( - input_dtypes={ - "input": TensorDataType.float32, - }, - output_dtypes={ - "output": TensorDataType.int32, - }, - dynamic_shapes=dynamic_shapes, - ), - input_shape, + ModelGetter( + get_model_fn=get_astype_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "input": TensorDataType.float32, + }, + output_dtypes={ + "output": TensorDataType.bfloat16, + }, + ), + get_model_kwargs=dict( + input_shape=(10, 4), ), - [ - (10, 4), - (20, 6), - (20, 8), - (10, 4, 4), - (10, 8, 4), - ], - {False: 5, True: 2} ), -]) +] + + 
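# Illustrative aside (not part of the patch): the ModelGetter helper above works purely
# through dict unpacking -- OVModelParameters(**{**defaults, **overrides}) -- so keys
# passed at the call site replace the stored defaults while everything else is kept.
# A standalone sketch of that merge semantics (names below are made up):
class GetterSketch:
    def __init__(self, **defaults):
        self._defaults = defaults

    def get(self, **overrides):
        # Later keys win: overrides shadow defaults, untouched defaults pass through.
        return {**self._defaults, **overrides}

sketch = GetterSketch(dynamic_shapes=False, recompile=False)
assert sketch.get(dynamic_shapes=True) == {"dynamic_shapes": True, "recompile": False}
# End of aside.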
+@pytest.mark.parametrize( + "model_getter,input_shapes,ref_cache_size", + [ + ( + MODEL_GETTERS[0], + [ + dict(weight_shape=(10, 4), scale_shape=(10, 1), zero_point_shape=(10, 1)), + dict(weight_shape=(20, 6), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(20, 8), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(10, 4, 4), scale_shape=(10, 4, 1), zero_point_shape=(10, 4, 1)), + dict(weight_shape=(10, 8, 4), scale_shape=(10, 8, 1), zero_point_shape=(10, 8, 1)), + ], + {False: 5, True: 2}, + ), + ( + MODEL_GETTERS[1], + [ + dict(weight_shape=(10, 4)), + dict(weight_shape=(20, 6)), + dict(weight_shape=(20, 8)), + dict(weight_shape=(10, 4, 4)), + dict(weight_shape=(10, 8, 4)), + ], + {False: 5, True: 2}, + ), + ( + MODEL_GETTERS[2], + [ + dict(weight_shape=(10, 4), scale_shape=(10, 1), zero_point_shape=(10, 1)), + dict(weight_shape=(20, 6), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(20, 8), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(10, 4, 4), scale_shape=(10, 4, 1), zero_point_shape=(10, 4, 1)), + dict(weight_shape=(10, 8, 4), scale_shape=(10, 8, 1), zero_point_shape=(10, 8, 1)), + ], + {False: 10, True: 4}, + ), + ( + MODEL_GETTERS[3], + [ + dict(weight_shape=(10, 4)), + dict(weight_shape=(20, 6)), + dict(weight_shape=(20, 8)), + dict(weight_shape=(10, 4, 4)), + dict(weight_shape=(10, 8, 4)), + ], + {False: 10, True: 4}, + ), + ( + MODEL_GETTERS[4], + [ + dict(input_shape=(10, 1)), + dict(input_shape=(10, 2)), + dict(input_shape=(20, 3)), + dict(input_shape=(10, 4, 4)), + dict(input_shape=(10, 8, 4)), + ], + {False: 5, True: 2}, + ), + ], +) @pytest.mark.parametrize("dynamic_shapes", [False, True]) -def test_dynamic_shapes(get_ov_model_fn, input_shapes, ref_cache_size, dynamic_shapes): - # Check that model cache contains fewer elements with dynamic shapes included +def test_dynamic_shapes(model_getter, input_shapes, ref_cache_size, dynamic_shapes): + # Check that model cache contains fewer elements with dynamic shapes enabled OV_MODEL_CACHE.clear() - for shape in input_shapes: - get_ov_model_fn(dynamic_shapes, shape) + for shape_kwargs in input_shapes: + model_getter.get(ov_model_params_kwargs=dict(dynamic_shapes=dynamic_shapes), get_model_kwargs=shape_kwargs) assert len(OV_MODEL_CACHE._cache) == ref_cache_size[dynamic_shapes] + + +@pytest.mark.parametrize("model_getter", MODEL_GETTERS) +@pytest.mark.parametrize("recompile", [True, False]) +def test_recompile(model_getter, recompile): + OV_MODEL_CACHE.clear() + model_getter.get(ov_model_params_kwargs=dict(recompile=recompile)) + ref_size = 0 if recompile else (2 if model_getter._get_model_fn == get_compress_decompress_weight_model else 1) + assert len(OV_MODEL_CACHE._cache) == ref_size + + +@pytest.mark.parametrize("model_getter", MODEL_GETTERS) +@pytest.mark.parametrize("return_ov_tensors", [True, False]) +def test_return_ov_tensors(model_getter, return_ov_tensors): + OV_MODEL_CACHE.clear() + inputs = [] + for input_name, input_dtype in model_getter._ov_model_params_kwargs["input_dtypes"].items(): + input_shape = model_getter._get_model_kwargs.get(f"{input_name}_shape") + if input_dtype in [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]: + inp = get_random_float_tensor(input_shape, input_dtype, TensorBackend.numpy) + else: + inp = get_random_integer_tensor(input_shape, 0, 16, input_dtype, TensorBackend.numpy) + inputs.append(inp) + + model_run_fn = 
model_getter.get(ov_model_params_kwargs=dict(return_ov_tensors=return_ov_tensors)) + outputs = model_run_fn(inputs) + + all_outputs_are_ov_tensors = all([out.backend == TensorBackend.ov for out in outputs]) + assert all_outputs_are_ov_tensors == return_ov_tensors From 8ac0fe2ef0fa195ebf4e2ab4930b0a41f4da5e86 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 13 Nov 2024 14:47:19 +0100 Subject: [PATCH 30/73] Move cache_results decorator --- nncf/common/utils/decorators.py | 44 +++++++++++++++ .../weight_compression/openvino_modeling.py | 3 +- nncf/results_caching.py | 55 ------------------- .../quantization/test_openvino_modeling.py | 3 +- 4 files changed, 46 insertions(+), 59 deletions(-) delete mode 100644 nncf/results_caching.py diff --git a/nncf/common/utils/decorators.py b/nncf/common/utils/decorators.py index d47c78c473a..0542c91e578 100644 --- a/nncf/common/utils/decorators.py +++ b/nncf/common/utils/decorators.py @@ -10,6 +10,7 @@ # limitations under the License. from importlib import import_module +import inspect from typing import Any, Callable, Dict, List from nncf.common.logging import nncf_logger @@ -51,3 +52,46 @@ def wrapped_f(*args: Any, **kwargs: Any): # type: ignore return wrapped_f return wrap + + +class ResultsCacheContainer: + def __init__(self): + self._cache = {} + self._access_count = {} + + def clear(self): + self._cache.clear() + self._access_count.clear() + + def is_empty(self): + return len(self._cache) == 0 + + def __getitem__(self, item): + self._access_count[item] += 1 + return self._cache[item] + + def __setitem__(self, key, value): + self._access_count[key] = 0 + self._cache[key] = value + + def __contains__(self, item): + return item in self._cache + + +def cache_results(cache: ResultsCacheContainer): + def decorator(func): + def wrapper(*args, disable_caching=False, **kwargs): + sig = inspect.signature(func) + new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} + new_kwargs.update(kwargs) + cache_key = (func.__name__, frozenset(new_kwargs.items())) + if cache_key in cache: + return cache[cache_key] + result = func(*args, **kwargs) + if not disable_caching: + cache[cache_key] = result + return result + + return wrapper + + return decorator diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 1cab401ee01..2acd9733c82 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -18,9 +18,8 @@ import openvino as ov from openvino.runtime import opset13 as opset +from nncf.common.utils.decorators import ResultsCacheContainer, cache_results from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.results_caching import ResultsCacheContainer -from nncf.results_caching import cache_results from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor.functions.ov import DTYPE_MAP as DTYPE_MAP_OV diff --git a/nncf/results_caching.py b/nncf/results_caching.py deleted file mode 100644 index 9b314863108..00000000000 --- a/nncf/results_caching.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect - - -class ResultsCacheContainer: - def __init__(self): - self._cache = {} - self._access_count = {} - - def clear(self): - self._cache.clear() - self._access_count.clear() - - def is_empty(self): - return len(self._cache) == 0 - - def __getitem__(self, item): - self._access_count[item] += 1 - return self._cache[item] - - def __setitem__(self, key, value): - self._access_count[key] = 0 - self._cache[key] = value - - def __contains__(self, item): - return item in self._cache - - -def cache_results(cache: ResultsCacheContainer): - def decorator(func): - def wrapper(*args, disable_caching=False, **kwargs): - sig = inspect.signature(func) - new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} - new_kwargs.update(kwargs) - cache_key = (func.__name__, frozenset(new_kwargs.items())) - if cache_key in cache: - return cache[cache_key] - result = func(*args, **kwargs) - if not disable_caching: - cache[cache_key] = result - return result - - return wrapper - - return decorator diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py index d7d562cff6c..3fd270132a6 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -20,6 +20,7 @@ import pytest from nncf import CompressWeightsMode +from nncf.common.utils.decorators import cache_results, ResultsCacheContainer from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters @@ -29,8 +30,6 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization -from nncf.results_caching import ResultsCacheContainer -from nncf.results_caching import cache_results from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor import functions as fns From ded66f3447676a3b98af3b4dffdc8bf6e6b6f9ac Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 13 Nov 2024 14:53:07 +0100 Subject: [PATCH 31/73] Tests reorgantization --- ...ing.py => test_ov_modeling_compression.py} | 205 ---------------- .../openvino/native/test_openvino_modeling.py | 224 ++++++++++++++++++ 2 files changed, 224 insertions(+), 205 deletions(-) rename tests/openvino/native/quantization/{test_openvino_modeling.py => test_ov_modeling_compression.py} (64%) create mode 100644 tests/openvino/native/test_openvino_modeling.py diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py similarity index 64% rename from tests/openvino/native/quantization/test_openvino_modeling.py rename to 
tests/openvino/native/quantization/test_ov_modeling_compression.py index 3fd270132a6..682ff604901 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -22,9 +22,7 @@ from nncf import CompressWeightsMode from nncf.common.utils.decorators import cache_results, ResultsCacheContainer from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters -from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight @@ -302,206 +300,3 @@ def test_quantization_alignment( MAX_MISALIGNMENT_MAGNITUDE * np.abs(scale) + EPS, err_msg=f"Too large misalignment for {key}.", ) - - -class ModelGetter: - def __init__(self, get_model_fn, ov_model_params_kwargs, get_model_kwargs): - self._get_model_fn = get_model_fn - self._ov_model_params_kwargs = ov_model_params_kwargs - self._get_model_kwargs = get_model_kwargs - - def get(self, ov_model_params_kwargs=None, get_model_kwargs=None): - ov_model_params_kwargs = ov_model_params_kwargs or {} - get_model_kwargs = get_model_kwargs or {} - return self._get_model_fn( - OVModelParameters(**{**self._ov_model_params_kwargs, **ov_model_params_kwargs}), - **{**self._get_model_kwargs, **get_model_kwargs}, - ) - - -MODEL_GETTERS = [ - ModelGetter( - get_model_fn=get_compress_weight_model, - ov_model_params_kwargs=dict( - input_dtypes={ - "weight": TensorDataType.float32, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - }, - output_dtypes={"compressed_weight": TensorDataType.uint8}, - ), - get_model_kwargs=dict( - config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), - weight_shape=(10, 4), - scale_shape=(10, 1), - zero_point_shape=(10, 1), - ), - ), - ModelGetter( - get_model_fn=get_compress_weight_model, - ov_model_params_kwargs=dict( - input_dtypes={"weight": TensorDataType.float32}, - output_dtypes={ - "compressed_weight": TensorDataType.uint8, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - }, - ), - get_model_kwargs=dict( - config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), - weight_shape=(10, 4), - reduction_axes=REDUCTION_AXES, - ), - ), - ModelGetter( - get_model_fn=get_compress_decompress_weight_model, - ov_model_params_kwargs=dict( - input_dtypes={ - "weight": TensorDataType.float32, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - }, - output_dtypes={ - "decompressed_weight": TensorDataType.float32, - }, - ), - get_model_kwargs=dict( - config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), - weight_shape=(10, 4), - scale_shape=(10, 1), - zero_point_shape=(10, 1), - ), - ), - ModelGetter( - get_model_fn=get_compress_decompress_weight_model, - ov_model_params_kwargs=dict( - input_dtypes={ - "weight": TensorDataType.float32, - }, - output_dtypes={ - "decompressed_weight": TensorDataType.float32, - "compressed_weight": TensorDataType.int32, - "scale": TensorDataType.float32, - "zero_point": 
TensorDataType.int32, - }, - ), - get_model_kwargs=dict( - config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), - weight_shape=(10, 4), - reduction_axes=REDUCTION_AXES, - return_compressed_weight=True, - ), - ), - ModelGetter( - get_model_fn=get_astype_model, - ov_model_params_kwargs=dict( - input_dtypes={ - "input": TensorDataType.float32, - }, - output_dtypes={ - "output": TensorDataType.bfloat16, - }, - ), - get_model_kwargs=dict( - input_shape=(10, 4), - ), - ), -] - - -@pytest.mark.parametrize( - "model_getter,input_shapes,ref_cache_size", - [ - ( - MODEL_GETTERS[0], - [ - dict(weight_shape=(10, 4), scale_shape=(10, 1), zero_point_shape=(10, 1)), - dict(weight_shape=(20, 6), scale_shape=(20, 1), zero_point_shape=(20, 1)), - dict(weight_shape=(20, 8), scale_shape=(20, 1), zero_point_shape=(20, 1)), - dict(weight_shape=(10, 4, 4), scale_shape=(10, 4, 1), zero_point_shape=(10, 4, 1)), - dict(weight_shape=(10, 8, 4), scale_shape=(10, 8, 1), zero_point_shape=(10, 8, 1)), - ], - {False: 5, True: 2}, - ), - ( - MODEL_GETTERS[1], - [ - dict(weight_shape=(10, 4)), - dict(weight_shape=(20, 6)), - dict(weight_shape=(20, 8)), - dict(weight_shape=(10, 4, 4)), - dict(weight_shape=(10, 8, 4)), - ], - {False: 5, True: 2}, - ), - ( - MODEL_GETTERS[2], - [ - dict(weight_shape=(10, 4), scale_shape=(10, 1), zero_point_shape=(10, 1)), - dict(weight_shape=(20, 6), scale_shape=(20, 1), zero_point_shape=(20, 1)), - dict(weight_shape=(20, 8), scale_shape=(20, 1), zero_point_shape=(20, 1)), - dict(weight_shape=(10, 4, 4), scale_shape=(10, 4, 1), zero_point_shape=(10, 4, 1)), - dict(weight_shape=(10, 8, 4), scale_shape=(10, 8, 1), zero_point_shape=(10, 8, 1)), - ], - {False: 10, True: 4}, - ), - ( - MODEL_GETTERS[3], - [ - dict(weight_shape=(10, 4)), - dict(weight_shape=(20, 6)), - dict(weight_shape=(20, 8)), - dict(weight_shape=(10, 4, 4)), - dict(weight_shape=(10, 8, 4)), - ], - {False: 10, True: 4}, - ), - ( - MODEL_GETTERS[4], - [ - dict(input_shape=(10, 1)), - dict(input_shape=(10, 2)), - dict(input_shape=(20, 3)), - dict(input_shape=(10, 4, 4)), - dict(input_shape=(10, 8, 4)), - ], - {False: 5, True: 2}, - ), - ], -) -@pytest.mark.parametrize("dynamic_shapes", [False, True]) -def test_dynamic_shapes(model_getter, input_shapes, ref_cache_size, dynamic_shapes): - # Check that model cache contains fewer elements with dynamic shapes enabled - OV_MODEL_CACHE.clear() - for shape_kwargs in input_shapes: - model_getter.get(ov_model_params_kwargs=dict(dynamic_shapes=dynamic_shapes), get_model_kwargs=shape_kwargs) - assert len(OV_MODEL_CACHE._cache) == ref_cache_size[dynamic_shapes] - - -@pytest.mark.parametrize("model_getter", MODEL_GETTERS) -@pytest.mark.parametrize("recompile", [True, False]) -def test_recompile(model_getter, recompile): - OV_MODEL_CACHE.clear() - model_getter.get(ov_model_params_kwargs=dict(recompile=recompile)) - ref_size = 0 if recompile else (2 if model_getter._get_model_fn == get_compress_decompress_weight_model else 1) - assert len(OV_MODEL_CACHE._cache) == ref_size - - -@pytest.mark.parametrize("model_getter", MODEL_GETTERS) -@pytest.mark.parametrize("return_ov_tensors", [True, False]) -def test_return_ov_tensors(model_getter, return_ov_tensors): - OV_MODEL_CACHE.clear() - inputs = [] - for input_name, input_dtype in model_getter._ov_model_params_kwargs["input_dtypes"].items(): - input_shape = model_getter._get_model_kwargs.get(f"{input_name}_shape") - if input_dtype in [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]: - inp = 
get_random_float_tensor(input_shape, input_dtype, TensorBackend.numpy) - else: - inp = get_random_integer_tensor(input_shape, 0, 16, input_dtype, TensorBackend.numpy) - inputs.append(inp) - - model_run_fn = model_getter.get(ov_model_params_kwargs=dict(return_ov_tensors=return_ov_tensors)) - outputs = model_run_fn(inputs) - - all_outputs_are_ov_tensors = all([out.backend == TensorBackend.ov for out in outputs]) - assert all_outputs_are_ov_tensors == return_ov_tensors diff --git a/tests/openvino/native/test_openvino_modeling.py b/tests/openvino/native/test_openvino_modeling.py new file mode 100644 index 00000000000..71f9cd316fb --- /dev/null +++ b/tests/openvino/native/test_openvino_modeling.py @@ -0,0 +1,224 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import pytest + +from nncf import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters +from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model +from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model +from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model +from nncf.tensor import TensorDataType, Tensor +from nncf.tensor.definitions import TensorBackend + +from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP + + +class ModelGetter: + def __init__(self, get_model_fn, ov_model_params_kwargs, get_model_kwargs): + self._get_model_fn = get_model_fn + self._ov_model_params_kwargs = ov_model_params_kwargs + self._get_model_kwargs = get_model_kwargs + + def get(self, ov_model_params_kwargs=None, get_model_kwargs=None): + ov_model_params_kwargs = ov_model_params_kwargs or {} + get_model_kwargs = get_model_kwargs or {} + return self._get_model_fn( + OVModelParameters(**{**self._ov_model_params_kwargs, **ov_model_params_kwargs}), + **{**self._get_model_kwargs, **get_model_kwargs}, + ) + + +MODEL_GETTERS = [ + ModelGetter( + get_model_fn=get_compress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "weight": TensorDataType.float32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + output_dtypes={"compressed_weight": TensorDataType.uint8}, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), + scale_shape=(10, 1), + zero_point_shape=(10, 1), + ), + ), + ModelGetter( + get_model_fn=get_compress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={"weight": TensorDataType.float32}, + output_dtypes={ + "compressed_weight": TensorDataType.uint8, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + ), + get_model_kwargs=dict( + 
config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), + reduction_axes=(1,), + ), + ), + ModelGetter( + get_model_fn=get_compress_decompress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "weight": TensorDataType.float32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + output_dtypes={ + "decompressed_weight": TensorDataType.float32, + }, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), + scale_shape=(10, 1), + zero_point_shape=(10, 1), + ), + ), + ModelGetter( + get_model_fn=get_compress_decompress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "weight": TensorDataType.float32, + }, + output_dtypes={ + "decompressed_weight": TensorDataType.float32, + "compressed_weight": TensorDataType.int32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), + reduction_axes=(1,), + return_compressed_weight=True, + ), + ), + ModelGetter( + get_model_fn=get_astype_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "input": TensorDataType.float32, + }, + output_dtypes={ + "output": TensorDataType.bfloat16, + }, + ), + get_model_kwargs=dict( + input_shape=(10, 4), + ), + ), +] + + +@pytest.mark.parametrize( + "model_getter,input_shapes,ref_cache_size", + [ + ( + MODEL_GETTERS[0], + [ + dict(weight_shape=(10, 4), scale_shape=(10, 1), zero_point_shape=(10, 1)), + dict(weight_shape=(20, 6), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(20, 8), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(10, 4, 4), scale_shape=(10, 4, 1), zero_point_shape=(10, 4, 1)), + dict(weight_shape=(10, 8, 4), scale_shape=(10, 8, 1), zero_point_shape=(10, 8, 1)), + ], + {False: 5, True: 2}, + ), + ( + MODEL_GETTERS[1], + [ + dict(weight_shape=(10, 4)), + dict(weight_shape=(20, 6)), + dict(weight_shape=(20, 8)), + dict(weight_shape=(10, 4, 4)), + dict(weight_shape=(10, 8, 4)), + ], + {False: 5, True: 2}, + ), + ( + MODEL_GETTERS[2], + [ + dict(weight_shape=(10, 4), scale_shape=(10, 1), zero_point_shape=(10, 1)), + dict(weight_shape=(20, 6), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(20, 8), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(10, 4, 4), scale_shape=(10, 4, 1), zero_point_shape=(10, 4, 1)), + dict(weight_shape=(10, 8, 4), scale_shape=(10, 8, 1), zero_point_shape=(10, 8, 1)), + ], + {False: 10, True: 4}, + ), + ( + MODEL_GETTERS[3], + [ + dict(weight_shape=(10, 4)), + dict(weight_shape=(20, 6)), + dict(weight_shape=(20, 8)), + dict(weight_shape=(10, 4, 4)), + dict(weight_shape=(10, 8, 4)), + ], + {False: 10, True: 4}, + ), + ( + MODEL_GETTERS[4], + [ + dict(input_shape=(10, 1)), + dict(input_shape=(10, 2)), + dict(input_shape=(20, 3)), + dict(input_shape=(10, 4, 4)), + dict(input_shape=(10, 8, 4)), + ], + {False: 5, True: 2}, + ), + ], +) +@pytest.mark.parametrize("dynamic_shapes", [False, True]) +def test_dynamic_shapes(model_getter, input_shapes, ref_cache_size, dynamic_shapes): + # Check that model cache contains fewer elements with dynamic shapes enabled + OV_MODEL_CACHE.clear() + for shape_kwargs in input_shapes: + model_getter.get(ov_model_params_kwargs=dict(dynamic_shapes=dynamic_shapes), get_model_kwargs=shape_kwargs) + assert len(OV_MODEL_CACHE._cache) == ref_cache_size[dynamic_shapes] + + 
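# Illustrative aside (not part of the patch): test_dynamic_shapes above expects 5 cached
# models for static shapes but only 2 for dynamic ones because the model builders replace
# every dimension with -1, so all shapes of the same rank collapse onto one cache key
# (the compress-decompress getters cache two models per key, hence the 10/4 cases).
# A standalone sketch of that keying behaviour:
def cache_key(shape, dynamic_shapes):
    return (-1,) * len(shape) if dynamic_shapes else tuple(shape)

shapes = [(10, 4), (20, 6), (20, 8), (10, 4, 4), (10, 8, 4)]
assert len({cache_key(s, dynamic_shapes=False) for s in shapes}) == 5
assert len({cache_key(s, dynamic_shapes=True) for s in shapes}) == 2  # one key per rank
# End of aside.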
+@pytest.mark.parametrize("model_getter", MODEL_GETTERS) +@pytest.mark.parametrize("recompile", [True, False]) +def test_recompile(model_getter, recompile): + # Check that with recompilation ov models are not cached + OV_MODEL_CACHE.clear() + model_getter.get(ov_model_params_kwargs=dict(recompile=recompile)) + ref_size = 0 if recompile else (2 if model_getter._get_model_fn == get_compress_decompress_weight_model else 1) + assert len(OV_MODEL_CACHE._cache) == ref_size + + +@pytest.mark.parametrize("model_getter", MODEL_GETTERS) +@pytest.mark.parametrize("return_ov_tensors", [True, False]) +def test_return_ov_tensors(model_getter, return_ov_tensors): + # Check that ov tensors are returned + OV_MODEL_CACHE.clear() + inputs = [] + for input_name, input_dtype in model_getter._ov_model_params_kwargs["input_dtypes"].items(): + input_shape = model_getter._get_model_kwargs.get(f"{input_name}_shape") + inputs.append(Tensor(np.zeros(input_shape, dtype=DTYPE_MAP_NP[input_dtype]))) + + model_run_fn = model_getter.get(ov_model_params_kwargs=dict(return_ov_tensors=return_ov_tensors)) + outputs = model_run_fn(inputs) + + assert all([out.backend == (TensorBackend.ov if return_ov_tensors else TensorBackend.numpy) for out in outputs]) From 69ae5fa871453caec9c31f05f6bb27a5500cb16e Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 13 Nov 2024 16:04:44 +0100 Subject: [PATCH 32/73] cache_results decorator test --- nncf/common/utils/decorators.py | 7 +- .../weight_compression/openvino_modeling.py | 3 +- .../utils/test_cache_results_decorator.py | 133 ++++++++++++++++++ .../test_ov_modeling_compression.py | 3 +- .../openvino/native/test_openvino_modeling.py | 4 +- 5 files changed, 143 insertions(+), 7 deletions(-) create mode 100644 tests/common/utils/test_cache_results_decorator.py diff --git a/nncf/common/utils/decorators.py b/nncf/common/utils/decorators.py index 0542c91e578..b3fd2a0e3ad 100644 --- a/nncf/common/utils/decorators.py +++ b/nncf/common/utils/decorators.py @@ -9,8 +9,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
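# Illustrative aside (not part of the patch), assuming this patch series is applied:
# the hunk below moves the disable_caching check to the top of cache_results' wrapper,
# so a disabled call neither computes a cache key nor reads or populates the cache.
# Typical usage of the decorator stays the same:
from nncf.common.utils.decorators import ResultsCacheContainer
from nncf.common.utils.decorators import cache_results

_CACHE = ResultsCacheContainer()

@cache_results(_CACHE)
def add(a, b):
    return a + b

add(1, 2)                        # computed, stored under ("add", frozenset({("a", 1), ("b", 2)}))
add(1, 2)                        # served from the cache; its access count becomes 1
add(1, 2, disable_caching=True)  # recomputed, cache left untouched
assert len(_CACHE._cache) == 1
# End of aside.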
-from importlib import import_module import inspect +from importlib import import_module from typing import Any, Callable, Dict, List from nncf.common.logging import nncf_logger @@ -81,6 +81,8 @@ def __contains__(self, item): def cache_results(cache: ResultsCacheContainer): def decorator(func): def wrapper(*args, disable_caching=False, **kwargs): + if disable_caching: + return func(*args, **kwargs) sig = inspect.signature(func) new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} new_kwargs.update(kwargs) @@ -88,8 +90,7 @@ def wrapper(*args, disable_caching=False, **kwargs): if cache_key in cache: return cache[cache_key] result = func(*args, **kwargs) - if not disable_caching: - cache[cache_key] = result + cache[cache_key] = result return result return wrapper diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 2acd9733c82..eb61c6ea5bd 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -18,7 +18,8 @@ import openvino as ov from openvino.runtime import opset13 as opset -from nncf.common.utils.decorators import ResultsCacheContainer, cache_results +from nncf.common.utils.decorators import ResultsCacheContainer +from nncf.common.utils.decorators import cache_results from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.tensor import Tensor from nncf.tensor import TensorDataType diff --git a/tests/common/utils/test_cache_results_decorator.py b/tests/common/utils/test_cache_results_decorator.py new file mode 100644 index 00000000000..599e41a421d --- /dev/null +++ b/tests/common/utils/test_cache_results_decorator.py @@ -0,0 +1,133 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
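# Illustrative aside (not part of the patch): the reference caches in the test below use
# keys of the form ("cached_addition", frozenset({("a", 1), ("b", 2)})). cache_results
# builds them by binding positional arguments to parameter names via inspect.signature,
# so f(1, 2) and f(a=1, b=2) hit the same entry; it also means every argument must be
# hashable, which is why OVModelParameters and WeightCompressionConfig define __hash__.
# A standalone sketch of the key construction:
import inspect

def make_cache_key(func, args, kwargs):
    bound = {name: arg for name, arg in zip(inspect.signature(func).parameters, args)}
    bound.update(kwargs)
    return (func.__name__, frozenset(bound.items()))

def cached_addition(a, b):
    return a + b

assert make_cache_key(cached_addition, (1, 2), {}) == ("cached_addition", frozenset({("a", 1), ("b", 2)}))
assert make_cache_key(cached_addition, (), {"a": 1, "b": 2}) == make_cache_key(cached_addition, (1, 2), {})
# End of aside.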
+import pytest + +from nncf.common.utils.decorators import ResultsCacheContainer +from nncf.common.utils.decorators import cache_results + +TEST_CACHE_CONTAINER = ResultsCacheContainer() + + +@cache_results(TEST_CACHE_CONTAINER) +def cached_addition(a, b): + return a + b + + +@pytest.mark.parametrize( + "inputs,disable_caching,output,clear_cache,cache_size,ref_cache,ref_access_count", + [ + ( + (1, 2), + False, + 3, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 0}, + ), + ( + (1, 2), + False, + 3, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 1}, + ), + ( + (2, 3), + True, + 5, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 1}, + ), + ( + (3, 4), + False, + 7, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 1, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 0, + }, + ), + ( + (1, 2), + False, + 3, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 0, + }, + ), + ( + (3, 4), + False, + 7, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 1, + }, + ), + ( + (3, 4), + True, + 7, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 1, + }, + ), + ((3, 4), True, 7, True, 0, {}, {}), + ( + (1, 2), + False, + 3, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 0}, + ), + ], +) +def test_caching_results(inputs, disable_caching, output, clear_cache, cache_size, ref_cache, ref_access_count): + if clear_cache: + TEST_CACHE_CONTAINER.clear() + kwargs = {"disable_caching": True} if disable_caching else {} + assert cached_addition(*inputs, **kwargs) == output + assert len(TEST_CACHE_CONTAINER._cache) == cache_size + assert TEST_CACHE_CONTAINER._cache == ref_cache + assert TEST_CACHE_CONTAINER._access_count == ref_access_count diff --git a/tests/openvino/native/quantization/test_ov_modeling_compression.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py index 682ff604901..d8c6bfa7ffa 100644 --- a/tests/openvino/native/quantization/test_ov_modeling_compression.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -20,7 +20,8 @@ import pytest from nncf import CompressWeightsMode -from nncf.common.utils.decorators import cache_results, ResultsCacheContainer +from nncf.common.utils.decorators import ResultsCacheContainer +from nncf.common.utils.decorators import cache_results from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters 
from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model diff --git a/tests/openvino/native/test_openvino_modeling.py b/tests/openvino/native/test_openvino_modeling.py index 71f9cd316fb..14ec9f740ab 100644 --- a/tests/openvino/native/test_openvino_modeling.py +++ b/tests/openvino/native/test_openvino_modeling.py @@ -18,9 +18,9 @@ from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model -from nncf.tensor import TensorDataType, Tensor +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType from nncf.tensor.definitions import TensorBackend - from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP From d0f49aeca9804c421a19f59bbab7b6ccdae395b1 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 13 Nov 2024 16:22:04 +0100 Subject: [PATCH 33/73] get_const_value test --- tests/openvino/native/test_node_utils.py | 63 ++++++++++++++++++------ 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/tests/openvino/native/test_node_utils.py b/tests/openvino/native/test_node_utils.py index fd962d6938b..241b9e6f156 100644 --- a/tests/openvino/native/test_node_utils.py +++ b/tests/openvino/native/test_node_utils.py @@ -29,30 +29,63 @@ @pytest.mark.parametrize( - "precisions", + "precisions,cast_bf16_to_fp32", [ # base FP32 precision - { - "type_for_const": ov.Type.f32, - "ref_type": np.float32, - }, + ( + { + "type_for_const": ov.Type.f32, + "ref_type": np.float32, + }, + True, + ), # base FP16 precision - { - "type_for_const": ov.Type.f16, - "ref_type": np.float16, - }, + ( + { + "type_for_const": ov.Type.f16, + "ref_type": np.float16, + }, + True, + ), # base BF16 precision should be casted to FP32 - { - "type_for_const": ov.Type.bf16, - "ref_type": np.float32, - }, + ( + { + "type_for_const": ov.Type.bf16, + "ref_type": np.float32, + }, + True, + ), + # base FP32 precision, cast_bf16_to_fp32=False has no effect + ( + { + "type_for_const": ov.Type.f32, + "ref_type": np.float32, + }, + False, + ), + # base FP16 precision, cast_bf16_to_fp32=False has no effect + ( + { + "type_for_const": ov.Type.f16, + "ref_type": np.float16, + }, + False, + ), + # with cast_bf16_to_fp32=False BF16 constant is retrieved as FP16 + ( + { + "type_for_const": ov.Type.bf16, + "ref_type": np.float16, + }, + False, + ), ], ) -def test_get_const_value(precisions): +def test_get_const_value(precisions, cast_bf16_to_fp32): const_data = np.ones((1, 2, 3), dtype=np.float32) weight_const = opset.constant(const_data, dtype=precisions["type_for_const"]) - const_value = get_const_value(weight_const) + const_value = get_const_value(weight_const, cast_bf16_to_fp32=cast_bf16_to_fp32) assert const_value.dtype == precisions["ref_type"] From a282976b3952f8c340069a6448fd2dd807425303 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 13 Nov 2024 16:54:57 +0100 Subject: [PATCH 34/73] OVModelParameters minor refactor --- nncf/openvino/graph/node_utils.py | 13 ++ .../weight_compression/openvino_modeling.py | 144 +++++++++++------- .../weight_compression/weight_lowering.py | 31 ++-- nncf/results_caching.py | 55 +++++++ 4 files changed, 168 insertions(+), 75 deletions(-) create mode 100644 nncf/results_caching.py diff --git a/nncf/openvino/graph/node_utils.py 
b/nncf/openvino/graph/node_utils.py index 05e759f1b16..24677d52968 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -630,3 +630,16 @@ def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: Tuple channel_axis = activations_layout.index(OVLayoutElem.C_IN) return channel_axis + + +def convert_if_needed(node: ov.Node, target_dtype: ov.Type) -> ov.Node: + """ + Converts the input node to the target data type if it is not already in the target data type. + + :param node: The input node to convert. + :param target_dtype: The target data type to convert the input node to. + :return: The converted node. + """ + if node.get_element_type() == target_dtype: + return node + return opset.convert(node, target_dtype) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index eb61c6ea5bd..d11679a9081 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -20,6 +20,7 @@ from nncf.common.utils.decorators import ResultsCacheContainer from nncf.common.utils.decorators import cache_results +from nncf.openvino.graph.node_utils import convert_if_needed from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.tensor import Tensor from nncf.tensor import TensorDataType @@ -32,22 +33,57 @@ OV_MODEL_CACHE = ResultsCacheContainer() -@dataclass +@dataclass(init=False) class OVModelParameters: - input_dtypes: Optional[Dict[str, TensorDataType]] = None - output_dtypes: Optional[Dict[str, TensorDataType]] = None - dynamic_shapes: bool = False - recompile: bool = False - release_memory: bool = True - share_inputs: bool = True - share_outputs: bool = True - return_ov_tensors: bool = False + def __init__( + self, + input_dtypes: Optional[Dict[str, TensorDataType]] = None, + output_dtypes: Optional[Dict[str, TensorDataType]] = None, + dynamic_shapes: bool = False, + recompile: bool = False, + release_memory: bool = True, + share_inputs: bool = True, + share_outputs: bool = True, + return_ov_tensors: bool = False, + ): + self.input_dtypes = input_dtypes or {} + self.output_dtypes = output_dtypes or {} + self.dynamic_shapes = dynamic_shapes + self.recompile = recompile + self.release_memory = release_memory + self.share_inputs = share_inputs + self.share_outputs = share_outputs + self.return_ov_tensors = return_ov_tensors + + def __copy__(self): + return OVModelParameters( + input_dtypes=self.input_dtypes.copy(), + output_dtypes=self.output_dtypes.copy(), + dynamic_shapes=self.dynamic_shapes, + recompile=self.recompile, + release_memory=self.release_memory, + share_inputs=self.share_inputs, + share_outputs=self.share_outputs, + return_ov_tensors=self.return_ov_tensors, + ) + + def __deepcopy__(self, memo): + return OVModelParameters( + input_dtypes=copy.deepcopy(self.input_dtypes, memo), + output_dtypes=copy.deepcopy(self.output_dtypes, memo), + dynamic_shapes=self.dynamic_shapes, + recompile=self.recompile, + release_memory=self.release_memory, + share_inputs=self.share_inputs, + share_outputs=self.share_outputs, + return_ov_tensors=self.return_ov_tensors, + ) def __hash__(self): return hash( ( - None if self.output_dtypes is None else frozenset(self.input_dtypes.items()), - None if self.output_dtypes is None else frozenset(self.output_dtypes.items()), + frozenset(self.input_dtypes.items()), + 
frozenset(self.output_dtypes.items()), self.dynamic_shapes, self.recompile, self.release_memory, @@ -158,20 +194,27 @@ def _build_compress_model( zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, -) -> Union[ModelCallable, Tuple[List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]: - input_dtypes = ov_model_params.input_dtypes - if input_dtypes is None: - raise ValueError("Input dtypes must be provided.") - output_dtypes = ov_model_params.output_dtypes - if output_dtypes is None: - raise ValueError("Output dtypes must be provided.") - - weight_dtype = input_dtypes.get("weight") - input_scale_dtype = input_dtypes.get("scale", TensorDataType.float32) - input_zero_point_dtype = input_dtypes.get("zero_point", TensorDataType.int32) - compressed_weight_dtype = output_dtypes.get("compressed_weight") - output_scale_dtype = output_dtypes.get("scale", TensorDataType.float32) - output_zero_point_dtype = output_dtypes.get("zero_point", TensorDataType.int32) +) -> Union[ModelCallable, Tuple[OVModelParameters, List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]: + is_int_asym = config.is_int_asym + default_input_dtypes = { + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + } + default_output_dtypes = { + "compressed_weight": TensorDataType.uint8 if is_int_asym else TensorDataType.int8, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + } + ov_model_params = copy.deepcopy(ov_model_params) + ov_model_params.input_dtypes = {**default_input_dtypes, **ov_model_params.input_dtypes} + ov_model_params.output_dtypes = {**default_output_dtypes, **ov_model_params.output_dtypes} + + weight_dtype = ov_model_params.input_dtypes["weight"] + input_scale_dtype = ov_model_params.input_dtypes["scale"] + input_zero_point_dtype = ov_model_params.input_dtypes["zero_point"] + compressed_weight_dtype = ov_model_params.output_dtypes["compressed_weight"] + output_scale_dtype = ov_model_params.output_dtypes["scale"] + output_zero_point_dtype = ov_model_params.output_dtypes["zero_point"] # Validate input dtypes valid_weight_dtypes = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] @@ -181,25 +224,25 @@ def _build_compress_model( ) if scale_shape is not None and input_scale_dtype != TensorDataType.float32: raise ValueError(f"Input scale must be of float32 data type. But found: {input_scale_dtype}.") - if zero_point_shape is not None and input_zero_point_dtype != TensorDataType.int32: - raise ValueError(f"Input zero point must be of int32 data type. But found: {input_zero_point_dtype}.") + if zero_point_shape is not None and input_zero_point_dtype not in [TensorDataType.int32, TensorDataType.float32]: + raise ValueError(f"Input zero point must be of int32/float32 data type. But found: {input_zero_point_dtype}.") # Validate output dtypes valid_compressed_weight_dtypes = [ + TensorDataType.float32, TensorDataType.int32, TensorDataType.int8, TensorDataType.uint8, TensorDataType.int4, TensorDataType.uint4, ] - if compressed_weight_dtype not in valid_compressed_weight_dtypes + [TensorDataType.float32]: + if compressed_weight_dtype not in valid_compressed_weight_dtypes: raise ValueError( f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. " f"But found: {compressed_weight_dtype}." ) if scale_shape is None and output_scale_dtype != TensorDataType.float32: raise ValueError(f"Output scale must be of float32 data type. 
But found: {output_scale_dtype}.") - is_int_asym = config.is_int_asym if is_int_asym and zero_point_shape is None and output_zero_point_dtype not in valid_compressed_weight_dtypes: raise ValueError( f"Output zero point must be of one of the following data types: {valid_compressed_weight_dtypes}. " @@ -222,7 +265,7 @@ def _build_compress_model( min_values = None if scale_shape is not None: # Scale is given as an input - scale = opset.parameter(scale_shape, name="scale", dtype=ov.Type.f32) + scale = opset.parameter(scale_shape, name="scale", dtype=DTYPE_MAP_OV[input_scale_dtype]) ov_parameters.append(scale) else: # Compute scale @@ -250,10 +293,10 @@ def _build_compress_model( zero_point = None if zero_point_shape is not None: # Zero point is given as an input - zero_point = opset.parameter(zero_point_shape, name="zero_point", dtype=ov.Type.i32) + zero_point = opset.parameter(zero_point_shape, name="zero_point", dtype=DTYPE_MAP_OV[input_zero_point_dtype]) ov_parameters.append(zero_point) # Cast to float32 for an addition later - zero_point = opset.convert(zero_point, ov.Type.f32) + zero_point = convert_if_needed(zero_point, ov.Type.f32) elif is_int_asym: # Compute zero point if min_values is None: @@ -264,8 +307,7 @@ def _build_compress_model( zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) zero_point = opset.clamp(zero_point, level_low, level_high) - if weight.get_element_type() != ov.Type.f32: - weight = opset.convert(weight, ov.Type.f32) + weight = convert_if_needed(weight, ov.Type.f32) compressed_weight = weight / scale if is_int_asym: @@ -273,18 +315,17 @@ def _build_compress_model( compressed_weight = opset.round(compressed_weight) compressed_weight = opset.clamp(opset.round(compressed_weight), level_low, level_high) - if compressed_weight_dtype != TensorDataType.float32: - compressed_weight = opset.convert(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) + compressed_weight = convert_if_needed(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) ov_results = [compressed_weight] if len(ov_parameters) == 1: ov_results.append(scale) if zero_point is not None: - zero_point = opset.convert(zero_point, DTYPE_MAP_OV[output_zero_point_dtype]) + zero_point = convert_if_needed(zero_point, DTYPE_MAP_OV[output_zero_point_dtype]) ov_results.append(zero_point) if return_nodes: - return ov_parameters, ov_results + return ov_model_params, ov_parameters, ov_results model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") @@ -302,22 +343,17 @@ def _build_compress_decompress_model( reduction_axes: Optional[Tuple] = None, return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: - input_dtypes = ov_model_params.input_dtypes - if input_dtypes is None: - raise ValueError("Input dtypes must be provided.") - output_dtypes = ov_model_params.output_dtypes - if output_dtypes is None: - raise ValueError("Output dtypes must be provided.") + default_output_dtypes = {"decompressed_weight": TensorDataType.float32} + if not return_compressed_weight: + default_output_dtypes["compressed_weight"] = TensorDataType.float32 + ov_model_params = copy.deepcopy(ov_model_params) + ov_model_params.output_dtypes = {**default_output_dtypes, **ov_model_params.output_dtypes} - decompressed_weight_dtype = output_dtypes.get("decompressed_weight") + decompressed_weight_dtype = ov_model_params.output_dtypes["decompressed_weight"] if decompressed_weight_dtype != TensorDataType.float32: raise ValueError(f"Decompressed weight must 
be of float32 data type. But found: {decompressed_weight_dtype}.") - if "compressed_weight" not in output_dtypes: - ov_model_params = copy.deepcopy(ov_model_params) - ov_model_params.output_dtypes["compressed_weight"] = TensorDataType.float32 - - ov_parameters, ov_results = get_compress_weight_model( + ov_model_params, ov_parameters, ov_results = get_compress_weight_model( ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True ) @@ -330,7 +366,9 @@ def _build_compress_decompress_model( compressed_weight = ov_results[0] scale, zero_point = ov_parameters[1:] - compressed_weight = opset.convert(compressed_weight, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32) + compressed_weight = convert_if_needed(compressed_weight, ov.Type.i32) - convert_if_needed( + zero_point, ov.Type.i32 + ) else: if len(ov_parameters) == 1: # weight -> compressed_weight, scale @@ -340,9 +378,7 @@ def _build_compress_decompress_model( compressed_weight = ov_results[0] scale = ov_parameters[1] - if compressed_weight.get_element_type() != ov.Type.f32: - compressed_weight = opset.convert(compressed_weight, ov.Type.f32) - decompressed_weight = opset.multiply(scale, compressed_weight) + decompressed_weight = opset.multiply(scale, convert_if_needed(compressed_weight, ov.Type.f32)) ov_results = [decompressed_weight] + ov_results if return_compressed_weight else [decompressed_weight] model = ov.Model(ov_results, ov_parameters) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 4e1cabc3790..6aa3bdf9867 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -492,16 +492,11 @@ def do_int_quantization( zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape ov_model_params = OVModelParameters() if ov_model_params is None else copy.deepcopy(ov_model_params) - ov_model_params.input_dtypes = ov_model_params.input_dtypes or { - "weight": weight.dtype, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - } - ov_model_params.output_dtypes = ov_model_params.output_dtypes or { - "compressed_weight": TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - } + ov_model_params.input_dtypes["weight"] = weight.dtype + if precomputed_scale is not None: + ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype + if precomputed_zero_point is not None: + ov_model_params.input_dtypes["zero_point"] = precomputed_zero_point.dtype if config.num_bits == 4 and weight.backend == TensorBackend.ov: # Return ov tensors in target precision to seamlessly insert them into openvino model later ov_model_params.return_ov_tensors = weight.backend == TensorBackend.ov @@ -596,17 +591,11 @@ def calculate_quantized_dequantized_weight( zero_point_shape = precomputed_zero_point.shape if precomputed_zero_point is not None else None ov_model_params = OVModelParameters() if ov_model_params is None else copy.deepcopy(ov_model_params) - ov_model_params.input_dtypes = ov_model_params.input_dtypes or { - "weight": weight.dtype, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - } - ov_model_params.output_dtypes = ov_model_params.output_dtypes or { - "decompressed_weight": TensorDataType.float32, - "compressed_weight": TensorDataType.uint8 if 
config.is_int_asym else TensorDataType.int8, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - } + ov_model_params.input_dtypes["weight"] = weight.dtype + if precomputed_scale is not None: + ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype + if precomputed_zero_point is not None: + ov_model_params.input_dtypes["zero_point"] = precomputed_zero_point.dtype model = get_compress_decompress_weight_model( ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_compressed_weight diff --git a/nncf/results_caching.py b/nncf/results_caching.py new file mode 100644 index 00000000000..9b314863108 --- /dev/null +++ b/nncf/results_caching.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect + + +class ResultsCacheContainer: + def __init__(self): + self._cache = {} + self._access_count = {} + + def clear(self): + self._cache.clear() + self._access_count.clear() + + def is_empty(self): + return len(self._cache) == 0 + + def __getitem__(self, item): + self._access_count[item] += 1 + return self._cache[item] + + def __setitem__(self, key, value): + self._access_count[key] = 0 + self._cache[key] = value + + def __contains__(self, item): + return item in self._cache + + +def cache_results(cache: ResultsCacheContainer): + def decorator(func): + def wrapper(*args, disable_caching=False, **kwargs): + sig = inspect.signature(func) + new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} + new_kwargs.update(kwargs) + cache_key = (func.__name__, frozenset(new_kwargs.items())) + if cache_key in cache: + return cache[cache_key] + result = func(*args, **kwargs) + if not disable_caching: + cache[cache_key] = result + return result + + return wrapper + + return decorator From b13f1865b9deab0b684bb074f840b3285db70f81 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 14 Nov 2024 16:25:06 +0100 Subject: [PATCH 35/73] Added OV tensor tests --- nncf/tensor/functions/ov.py | 50 ++++++++------- tests/openvino/native/test_tensor.py | 94 ++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 22 deletions(-) create mode 100644 tests/openvino/native/test_tensor.py diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index a868d310190..96bb441e45f 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -18,6 +18,7 @@ from nncf.tensor.functions import numeric from ..definitions import TensorBackend +from ..definitions import TensorDeviceType from .numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP from .numpy_numeric import DTYPE_MAP_REV as DTYPE_MAP_REV_NP @@ -37,26 +38,9 @@ DTYPE_MAP_REV = {v: k for k, v in DTYPE_MAP.items()} -def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: - from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters - from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model - - a_dtype = 
DTYPE_MAP_REV[a.get_element_type()] - - model = get_astype_model( - OVModelParameters( - input_dtypes={"input": a_dtype}, - output_dtypes={"output": dtype}, - dynamic_shapes=True, - recompile=False, - release_memory=True, - share_inputs=True, - share_outputs=True, - return_ov_tensors=True, - ), - tuple(a.shape), - ) - return model([Tensor(a)])[0].data +@numeric.device.register(ov.Tensor) +def _(a: ov.Tensor) -> TensorDeviceType: + return TensorDeviceType.CPU @numeric.backend.register(ov.Tensor) @@ -71,7 +55,7 @@ def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: TensorDataType.int4, TensorDataType.uint4, ]: - return _ov_astype(a, dtype) + return _astype_ov(a, dtype) return ov.Tensor(a.data.astype(DTYPE_MAP_NP[dtype])) @@ -114,6 +98,28 @@ def _(a: ov.Tensor, b: TensorBackend) -> Union[np.ndarray, ov.Tensor]: dtype = TensorDataType.uint8 elif a_dtype == TensorDataType.int4: dtype = TensorDataType.int8 - a = _ov_astype(a, dtype) + a = _astype_ov(a, dtype) return a.data + + +def _astype_ov(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: + from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters + from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model + + a_dtype = DTYPE_MAP_REV[a.get_element_type()] + + model = get_astype_model( + OVModelParameters( + input_dtypes={"input": a_dtype}, + output_dtypes={"output": dtype}, + dynamic_shapes=False, + recompile=True, + release_memory=True, + share_inputs=True, + share_outputs=True, + return_ov_tensors=True, + ), + tuple(a.shape), + ) + return model([Tensor(a)])[0].data diff --git a/tests/openvino/native/test_tensor.py b/tests/openvino/native/test_tensor.py new file mode 100644 index 00000000000..e9b1a136a4b --- /dev/null +++ b/tests/openvino/native/test_tensor.py @@ -0,0 +1,94 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import openvino as ov +import pytest + +import openvino.runtime.opset13 as opset +from nncf.tensor import TensorDataType, Tensor +from nncf.tensor.definitions import TensorBackend +from nncf.tensor.definitions import TensorDeviceType +import nncf.tensor.functions as fns +from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP +from nncf.tensor.functions.ov import DTYPE_MAP as DTYPE_MAP_OV + + +class TestOVNNCFTensorOperators: + @staticmethod + def to_tensor(x, backend=TensorBackend.ov, dtype=TensorDataType.float32): + if backend == TensorBackend.ov: + if dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]: + ov_const = opset.constant(x, dtype=DTYPE_MAP_OV[dtype]) + return ov.Tensor(ov_const.data, ov_const.data.shape, DTYPE_MAP_OV[dtype]) + else: + return ov.Tensor(np.array(x, dtype=DTYPE_MAP_NP[dtype])) + elif backend == TensorBackend.numpy: + if dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]: + raise ValueError(f"Can't create NumPY tensor in dtype {dtype}") + return np.array(x, dtype=DTYPE_MAP_NP[dtype]) + else: + raise ValueError("Unsupported backend") + + @staticmethod + def backend() -> TensorBackend: + return TensorBackend.ov + + def test_property_backend(self): + tensor_a = Tensor(self.to_tensor([1, 2])) + assert tensor_a.backend == self.backend() + + def test_device(self): + tensor = Tensor(self.to_tensor([1])) + assert tensor.device == TensorDeviceType.CPU + + def test_size(self): + tensor = Tensor(self.to_tensor([1, 1])) + res = tensor.size + assert res == 2 + + def test_astype(self): + tensor = Tensor(self.to_tensor([1])) + res = tensor.astype(TensorDataType.int8) + assert isinstance(res, Tensor) + assert res.dtype == TensorDataType.int8 + assert res.device == tensor.device + + def test_fn_astype(self): + tensor = Tensor(self.to_tensor([1])) + res = fns.astype(tensor, TensorDataType.int8) + assert isinstance(res, Tensor) + assert res.dtype == TensorDataType.int8 + + def test_reshape(self): + tensor = Tensor(self.to_tensor([1, 1])) + res = tensor.reshape((1, 2)) + assert tensor.shape == (2,) + assert res.shape == (1, 2) + assert res.device == tensor.device + + def test_fn_reshape(self): + tensor = Tensor(self.to_tensor([1, 1])) + res = fns.reshape(tensor, (1, 2)) + assert tensor.shape == (2,) + assert res.shape == (1, 2) + assert res.device == tensor.device + + @pytest.mark.parametrize("from_backend", [TensorBackend.numpy, TensorBackend.ov]) + @pytest.mark.parametrize("to_backend", [TensorBackend.numpy, TensorBackend.ov]) + def test_to_backend(self, from_backend, to_backend): + tensor1 = Tensor(self.to_tensor([1], backend=from_backend)) + assert tensor1.backend == from_backend + tensor2 = tensor1.to_backend(to_backend) + assert tensor2.backend == to_backend + assert tensor1.dtype == tensor2.dtype + assert tensor1.shape == tensor2.shape + assert tensor1.device == tensor2.device From 9e90d5abec32f554c66884ff8c84328dee6d6559 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 14 Nov 2024 16:29:06 +0100 Subject: [PATCH 36/73] Minor file reorg --- nncf/{utils.py => import_utils.py} | 0 nncf/results_caching.py | 55 ------------------------------ 2 files changed, 55 deletions(-) rename nncf/{utils.py => import_utils.py} (100%) delete mode 100644 nncf/results_caching.py diff --git a/nncf/utils.py b/nncf/import_utils.py similarity index 100% rename from nncf/utils.py rename to nncf/import_utils.py diff --git a/nncf/results_caching.py b/nncf/results_caching.py deleted file mode 100644 
index 9b314863108..00000000000 --- a/nncf/results_caching.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect - - -class ResultsCacheContainer: - def __init__(self): - self._cache = {} - self._access_count = {} - - def clear(self): - self._cache.clear() - self._access_count.clear() - - def is_empty(self): - return len(self._cache) == 0 - - def __getitem__(self, item): - self._access_count[item] += 1 - return self._cache[item] - - def __setitem__(self, key, value): - self._access_count[key] = 0 - self._cache[key] = value - - def __contains__(self, item): - return item in self._cache - - -def cache_results(cache: ResultsCacheContainer): - def decorator(func): - def wrapper(*args, disable_caching=False, **kwargs): - sig = inspect.signature(func) - new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} - new_kwargs.update(kwargs) - cache_key = (func.__name__, frozenset(new_kwargs.items())) - if cache_key in cache: - return cache[cache_key] - result = func(*args, **kwargs) - if not disable_caching: - cache[cache_key] = result - return result - - return wrapper - - return decorator From 5f46593aaacb4152494f12be1dabde9bc95ff959 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 14 Nov 2024 16:42:29 +0100 Subject: [PATCH 37/73] Tweaks --- .../algorithms/weight_compression/weight_lowering.py | 2 +- nncf/tensor/tensor.py | 6 ------ .../native/quantization/test_ov_modeling_compression.py | 6 +----- tests/openvino/native/test_tensor.py | 7 ++++--- 4 files changed, 6 insertions(+), 15 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 6aa3bdf9867..459af440696 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -17,6 +17,7 @@ import nncf from nncf.common.logging.logger import log_once +from nncf.import_utils import is_openvino_available from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.fake_quantize import calculate_scale_zero_point @@ -24,7 +25,6 @@ from nncf.tensor import functions as fns from nncf.tensor.definitions import TensorBackend from nncf.tensor.definitions import TensorDataType -from nncf.utils import is_openvino_available ReductionAxes = Tuple[int, ...] 
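For context, a minimal sketch of how the caching utilities consolidated into nncf/common/utils/decorators.py by the commits above are intended to be used. The build_model function, its arguments and the EXAMPLE_CACHE name are hypothetical placeholders, not part of the patches; only the decorator behavior shown in the diffs is assumed.

from nncf.common.utils.decorators import ResultsCacheContainer
from nncf.common.utils.decorators import cache_results

EXAMPLE_CACHE = ResultsCacheContainer()


@cache_results(EXAMPLE_CACHE)
def build_model(shape, dtype):
    # Hypothetical expensive construction step. All arguments become part of the cache key
    # (function name plus a frozenset of the keyword items), so they must be hashable.
    ...


first = build_model((8, 16), "f32")                        # computed and stored in the cache
second = build_model((8, 16), "f32")                       # served from the cache
third = build_model((8, 16), "f32", disable_caching=True)  # recomputed, result not cached
EXAMPLE_CACHE.clear()                                      # drop all cached results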
diff --git a/nncf/tensor/tensor.py b/nncf/tensor/tensor.py index a17758c2ab1..19cba0482a9 100644 --- a/nncf/tensor/tensor.py +++ b/nncf/tensor/tensor.py @@ -116,12 +116,6 @@ def __ipow__(self, other: Union[Tensor, float]) -> Tensor: self._data **= unwrap_tensor_data(other) return self - # def __truediv__(self, other: Union[Tensor, float]) -> Tensor: - # return self * _call_function("_binary_op_nowarn", 1.0, other, operator.truediv) - # - # def __rtruediv__(self, other: Union[Tensor, float]) -> Tensor: - # return other * _call_function("_binary_reverse_op_nowarn", self, 1.0, operator.truediv) - def __truediv__(self, other: Union[Tensor, float]) -> Tensor: return _call_function("_binary_op_nowarn", self, other, operator.truediv) diff --git a/tests/openvino/native/quantization/test_ov_modeling_compression.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py index d8c6bfa7ffa..068795b485b 100644 --- a/tests/openvino/native/quantization/test_ov_modeling_compression.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -58,8 +58,6 @@ class QuantizationTask(Enum): WeightCompressionConfig(CompressWeightsMode.INT4_SYM, group_size=2), ] -DATA_TYPES = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] - MAX_MISALIGNMENT_FREQUENCY = { TensorDataType.float32: 1e-2, # tends to < 5e-6 TensorDataType.float16: 1e-2, # tends to < 5e-5 @@ -68,8 +66,6 @@ class QuantizationTask(Enum): MAX_MISALIGNMENT_MAGNITUDE = 1 -TENSOR_BACKENDS = [TensorBackend.numpy, TensorBackend.ov] - EPS = np.finfo(np.float32).eps REDUCTION_AXES = (1,) @@ -126,7 +122,7 @@ def openvino_available(available: bool): (QuantizationTask.Q_DQ_RQ, "auto"), ], ) -@pytest.mark.parametrize("dtype", DATA_TYPES) +@pytest.mark.parametrize("dtype", [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]) @pytest.mark.parametrize("precompute_s_zp", [False, True], ids=["no-precompute", "precompute"]) @pytest.mark.parametrize("static_shapes", [False, True], ids=["dynamic-shapes", "static-shapes"]) def test_quantization_alignment( diff --git a/tests/openvino/native/test_tensor.py b/tests/openvino/native/test_tensor.py index e9b1a136a4b..1adb98c5d66 100644 --- a/tests/openvino/native/test_tensor.py +++ b/tests/openvino/native/test_tensor.py @@ -11,13 +11,14 @@ import numpy as np import openvino as ov +import openvino.runtime.opset13 as opset import pytest -import openvino.runtime.opset13 as opset -from nncf.tensor import TensorDataType, Tensor +import nncf.tensor.functions as fns +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType from nncf.tensor.definitions import TensorBackend from nncf.tensor.definitions import TensorDeviceType -import nncf.tensor.functions as fns from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP from nncf.tensor.functions.ov import DTYPE_MAP as DTYPE_MAP_OV From e7617f1816a11de673a6e549c18e1969074136fd Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 14 Nov 2024 16:47:50 +0100 Subject: [PATCH 38/73] Tweaks --- .../native/quantization/test_ov_modeling_compression.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/openvino/native/quantization/test_ov_modeling_compression.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py index 068795b485b..3e09714cae0 100644 --- a/tests/openvino/native/quantization/test_ov_modeling_compression.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -99,12 +99,12 @@ def 
get_random_integer_tensor(shape, low, high, dtype, backend, seed=0): @contextmanager def openvino_available(available: bool): - import nncf.utils + import nncf.import_utils - original_value = nncf.utils._openvino_available - nncf.utils._openvino_available = available + original_value = nncf.import_utils._openvino_available + nncf.import_utils._openvino_available = available yield - nncf.utils._openvino_available = original_value + nncf.import_utils._openvino_available = original_value @pytest.mark.parametrize("weight_shape", [(10000, 4)], ids=[""]) From 925f830dd9e40f673edb243bd2d71c235c493e2f Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 15 Nov 2024 10:43:50 +0100 Subject: [PATCH 39/73] Switch to OV 2024.5 rc2 --- .github/workflows/precommit.yml | 4 ++-- tests/openvino/native/models.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index 218d9c32fd1..822c360349e 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -64,8 +64,8 @@ jobs: cache: pip - name: Install NNCF and test requirements run: make install-openvino-test - - name: Install OpenVINO Nightly - run: pip install -U --pre openvino==2024.5.0.dev20241015 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - name: Install OpenVINO 2024.5 + run: pip install -U --pre openvino==2024.5.0rc2 openvino-tokenizers==2024.5.0rc2 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - name: Print installed modules run: pip list - name: Run OV precommit test scope diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index 7bca0e5b04f..95f079a8800 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -290,11 +290,11 @@ def __init__(self, const_dtype: ov.Type = ov.Type.f32, input_dtype: ov.Type = ov def _create_ov_model(self): input_shape = [1, 3, 4, 2] input_1 = opset.parameter(input_shape, name="Input", dtype=self.input_dtype) - data = opset.constant(self._rng.random((1, 3, 4, 5)), dtype=self.const_dtype, name="MatMul_const") + data = opset.constant(value=self._rng.random((1, 3, 4, 5)), dtype=self.const_dtype, name="MatMul_const") if self.const_dtype != self.input_dtype: data = opset.convert(data, self.input_dtype.to_string()) matmul = opset.matmul(input_1, data, transpose_a=True, transpose_b=False, name="MatMul") - bias = opset.constant(self._rng.random((1, 3, 1, 1)), dtype=self.const_dtype, name="MatMul_bias") + bias = opset.constant(value=self._rng.random((1, 3, 1, 1)), dtype=self.const_dtype, name="MatMul_bias") if self.const_dtype != self.input_dtype: bias = opset.convert(bias, self.input_dtype.to_string()) add = opset.add(matmul, bias, name="Add") From 5831fcda2d855226f79a37550a6f92584efe0315 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 15 Nov 2024 11:34:17 +0100 Subject: [PATCH 40/73] Additional tests for ov_modeling --- .../openvino/native/test_openvino_modeling.py | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tests/openvino/native/test_openvino_modeling.py b/tests/openvino/native/test_openvino_modeling.py index 14ec9f740ab..4a6f11654c2 100644 --- a/tests/openvino/native/test_openvino_modeling.py +++ b/tests/openvino/native/test_openvino_modeling.py @@ -18,6 +18,7 @@ from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import 
get_compress_decompress_weight_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model +from nncf.quantization.algorithms.weight_compression.openvino_modeling import run_model from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor.definitions import TensorBackend @@ -222,3 +223,75 @@ def test_return_ov_tensors(model_getter, return_ov_tensors): outputs = model_run_fn(inputs) assert all([out.backend == (TensorBackend.ov if return_ov_tensors else TensorBackend.numpy) for out in outputs]) + + +@pytest.mark.parametrize("release_memory", [True, False]) +def test_release_memory(mocker, release_memory): + compiled_model = mocker.Mock() + compiled_model.release_memory = mocker.Mock() + + input_mock = mocker.Mock() + input_mock.any_name = "input" + compiled_model.inputs = [input_mock] + + output_mock = mocker.Mock() + compiled_model.return_value = [output_mock] + + ov_model_params = OVModelParameters(input_dtypes={"input": TensorDataType.float32}, release_memory=release_memory) + input_tensor = mocker.Mock() + input_tensor.dtype = TensorDataType.float32 + input_tensor.data = [1, 2, 3] + inputs = [input_tensor] + + run_model(ov_model_params, compiled_model, return_ov_tensors=False, inputs=inputs) + if release_memory: + compiled_model.release_memory.assert_called_once() + else: + compiled_model.release_memory.assert_not_called() + + +@pytest.mark.parametrize("share_inputs", [True, False]) +@pytest.mark.parametrize("share_outputs", [True, False]) +@pytest.mark.parametrize("return_ov_tensors", [True, False]) +def test_share_inputs_outputs(mocker, share_inputs, share_outputs, return_ov_tensors): + compiled_model = mocker.Mock() + + input_mock = mocker.Mock() + input_mock.any_name = "input" + compiled_model.inputs = [input_mock] + + output_mock = mocker.Mock() + + if return_ov_tensors: + infer_request = mocker.Mock() + compiled_model.create_infer_request.return_value = infer_request + + infer_request.infer = mocker.Mock() + infer_request.results = [output_mock] + + infer_request.get_output_tensor.return_value = output_mock + else: + compiled_model.return_value = [output_mock] + + ov_model_params = OVModelParameters( + input_dtypes={"input": TensorDataType.float32}, + return_ov_tensors=return_ov_tensors, + share_inputs=share_inputs, + share_outputs=share_outputs, + ) + + input_tensor = mocker.Mock() + input_tensor.dtype = TensorDataType.float32 + input_tensor.data = [1, 2, 3] + inputs = [input_tensor] + + run_model(ov_model_params, compiled_model, return_ov_tensors=return_ov_tensors, inputs=inputs) + + if return_ov_tensors: + infer_request.infer.assert_called_once_with( + [input_tensor.data], share_inputs=share_inputs, share_outputs=share_outputs + ) + else: + compiled_model.assert_called_once_with( + [input_tensor.data], share_inputs=share_inputs, share_outputs=share_outputs + ) From 9160de3dec73725cd0ba1d287bf23b89e9133883 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 15 Nov 2024 11:44:28 +0100 Subject: [PATCH 41/73] Type hints --- nncf/common/utils/decorators.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/nncf/common/utils/decorators.py b/nncf/common/utils/decorators.py index b3fd2a0e3ad..4d5b2247ba6 100644 --- a/nncf/common/utils/decorators.py +++ b/nncf/common/utils/decorators.py @@ -55,32 +55,32 @@ def wrapped_f(*args: Any, **kwargs: Any): # type: ignore class ResultsCacheContainer: - def __init__(self): - self._cache = {} - self._access_count = {} + def 
__init__(self) -> None: + self._cache: Dict[Any, Any] = {} + self._access_count: Dict[Any, int] = {} - def clear(self): + def clear(self) -> None: self._cache.clear() self._access_count.clear() - def is_empty(self): + def is_empty(self) -> bool: return len(self._cache) == 0 - def __getitem__(self, item): + def __getitem__(self, item: Any) -> Any: self._access_count[item] += 1 return self._cache[item] - def __setitem__(self, key, value): + def __setitem__(self, key: Any, value: Any) -> None: self._access_count[key] = 0 self._cache[key] = value - def __contains__(self, item): + def __contains__(self, item: Any) -> bool: return item in self._cache -def cache_results(cache: ResultsCacheContainer): - def decorator(func): - def wrapper(*args, disable_caching=False, **kwargs): +def cache_results(cache: ResultsCacheContainer) -> Callable: + def decorator(func: Callable) -> Callable: + def wrapper(*args, disable_caching: bool = False, **kwargs) -> Any: if disable_caching: return func(*args, **kwargs) sig = inspect.signature(func) From c7c63eb34b10b77e0b48229d7be92a94186c75a1 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 15 Nov 2024 11:50:43 +0100 Subject: [PATCH 42/73] Ignore mypy --- nncf/common/utils/decorators.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nncf/common/utils/decorators.py b/nncf/common/utils/decorators.py index 4d5b2247ba6..c2ef9a4fe92 100644 --- a/nncf/common/utils/decorators.py +++ b/nncf/common/utils/decorators.py @@ -78,9 +78,9 @@ def __contains__(self, item: Any) -> bool: return item in self._cache -def cache_results(cache: ResultsCacheContainer) -> Callable: - def decorator(func: Callable) -> Callable: - def wrapper(*args, disable_caching: bool = False, **kwargs) -> Any: +def cache_results(cache: ResultsCacheContainer) -> Callable: # type: ignore + def decorator(func: Callable) -> Callable: # type: ignore + def wrapper(*args, disable_caching: bool = False, **kwargs) -> Any: # type: ignore if disable_caching: return func(*args, **kwargs) sig = inspect.signature(func) From 764f7222caa44e65c25ec9139d6a65b4682b44b7 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 15 Nov 2024 11:56:04 +0100 Subject: [PATCH 43/73] Reuse DTYPE_MAP_REV --- .../weight_compression/openvino_backend.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index b0d0ae79c96..ffec97d080e 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -53,6 +53,7 @@ from nncf.tensor import Tensor from nncf.tensor.definitions import TensorBackend from nncf.tensor.definitions import TensorDataType +from nncf.tensor.functions.ov import DTYPE_MAP_REV class OVWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): @@ -130,19 +131,9 @@ def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov. 
    def get_weight_dtype(
        self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph
    ) -> TensorDataType:
-        # TODO: use from nncf.tensor.functions.ov import DTYPE_MAP
        ov_type_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["dtype"]
-        dtype_map = {
-            "f16": TensorDataType.float16,
-            "bf16": TensorDataType.bfloat16,
-            "f32": TensorDataType.float32,
-            "f64": TensorDataType.float64,
-            "i8": TensorDataType.int8,
-            "i32": TensorDataType.int32,
-            "i64": TensorDataType.int64,
-            "u8": TensorDataType.uint8,
-        }
-        return dtype_map.get(ov_type_name)
+        ov_type = getattr(ov.Type, ov_type_name)
+        return DTYPE_MAP_REV[ov_type]

    @staticmethod
    def get_weight_shape(node_with_weight: NNCFNode, weight_port_id: int, graph: NNCFGraph) -> Tuple:

From 4a448e1c46ae89a24c602663053f857279163677 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Mon, 18 Nov 2024 16:22:13 +0100
Subject: [PATCH 44/73] Added docstrings

---
 nncf/common/logging/logger.py                 |   7 +-
 nncf/common/utils/decorators.py               |  15 +++
 nncf/import_utils.py                          |   4 +
 nncf/openvino/graph/node_utils.py             |   2 +
 .../weight_compression/openvino_backend.py    |  16 ++-
 .../weight_compression/openvino_modeling.py   | 126 ++++++++++++++----
 .../weight_compression/weight_lowering.py     |  58 ++++----
 nncf/quantization/fake_quantize.py            |   4 +-
 nncf/tensor/functions/__init__.py             |   4 +-
 nncf/tensor/functions/numeric.py              |  10 +-
 nncf/tensor/functions/ov.py                   |   7 +
 .../openvino/native/test_openvino_modeling.py |   6 +-
 12 files changed, 188 insertions(+), 71 deletions(-)

diff --git a/nncf/common/logging/logger.py b/nncf/common/logging/logger.py
index e13fcaa8442..5b02bbb77f1 100644
--- a/nncf/common/logging/logger.py
+++ b/nncf/common/logging/logger.py
@@ -90,5 +90,10 @@ def warn_bkc_version_mismatch(backend: str, bkc_version: str, current_version: s


 @lru_cache(None)
-def log_once(level, message):
+def log_once(level: int, message: str) -> None:
+    """
+    Logs a message only once.
+    :param level: Logging level, e.g. logging.WARNING.
+    :param message: The message to log.
+    """
     nncf_logger.log(level, message)

diff --git a/nncf/common/utils/decorators.py b/nncf/common/utils/decorators.py
index c2ef9a4fe92..5f9f14eaf4c 100644
--- a/nncf/common/utils/decorators.py
+++ b/nncf/common/utils/decorators.py
@@ -55,8 +55,14 @@ def wrapped_f(*args: Any, **kwargs: Any):  # type: ignore


 class ResultsCacheContainer:
+    """
+    A container for results of functions decorated with the @cache_results decorator.
+    """
+
     def __init__(self) -> None:
+        # Stores the results of the decorated function
         self._cache: Dict[Any, Any] = {}
+        # Stores the number of times the cached result was accessed
         self._access_count: Dict[Any, int] = {}

     def clear(self) -> None:
@@ -79,6 +85,15 @@ def __contains__(self, item: Any) -> bool:
         return item in self._cache


-def cache_results(cache: ResultsCacheContainer) -> Callable:  # type: ignore
+def cache_results(cache: ResultsCacheContainer) -> Callable:  # type: ignore
+    """
+    Decorator to cache the results of a function.
+
+    The decorated function additionally accepts a `disable_caching` argument to disable caching if needed. If it is
+    True, the result will not be saved to the cache. Also, if there is a corresponding result in the cache, it will
+    be recomputed instead of being taken from the cache.
+    :param cache: A cache container where results will be stored.
+ """ + def decorator(func: Callable) -> Callable: # type: ignore def wrapper(*args, disable_caching: bool = False, **kwargs) -> Any: # type: ignore if disable_caching: diff --git a/nncf/import_utils.py b/nncf/import_utils.py index 50a315e4048..3608deeae20 100644 --- a/nncf/import_utils.py +++ b/nncf/import_utils.py @@ -29,4 +29,8 @@ def is_openvino_available(): + """ + Check if OpenVINO is available. + :return: True if openvino package is installed, False otherwise. + """ return _openvino_available diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 24677d52968..67bf9143cd4 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -113,6 +113,8 @@ def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = Tru This method is applicable only for the floating-point constant data. :param const_node: OpenVINO node. + :param cast_bf16_to_fp32: Whether to cast bf16 node data to fp32 or not. If False and the node contains bf16 data, + the resulting bf16 value will be returned encoded inside a numpy.float16 array. :return: The constant value. """ if const_node.get_element_type() == ov.Type.bf16 and cast_bf16_to_fp32: diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index ffec97d080e..49e842f72d5 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -30,6 +30,7 @@ from nncf.openvino.graph.metatypes import openvino_metatypes as om from nncf.openvino.graph.metatypes.groups import ATOMIC_ACTIVATIONS_OPERATIONS from nncf.openvino.graph.model_transformer import OVModelTransformer +from nncf.openvino.graph.node_utils import convert_if_needed from nncf.openvino.graph.node_utils import get_const_value from nncf.openvino.graph.node_utils import get_weight_channel_axes from nncf.openvino.graph.transformations.command_creation import OVCommandCreator @@ -242,8 +243,7 @@ def _create_compression_subgraph( compressed_const = self._create_ov_const_from_tensor( compressed_weight.tensor, compression_dtype, name=const_node_name ) - if compressed_const.get_element_type() != compression_dtype: - compressed_const = opset.convert(compressed_const, compression_dtype) + compressed_const = convert_if_needed(compressed_const, compression_dtype) converted_const = opset.convert(compressed_const, ov.Type.f16) if compressed_weight.zero_point is not None: @@ -258,8 +258,7 @@ def _create_compression_subgraph( scale_const = self._create_ov_const_from_tensor( compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale" ) - if scale_const.get_element_type() != ov.Type.f16: - scale_const = opset.convert(scale_const, ov.Type.f16) + scale_const = convert_if_needed(scale_const, ov.Type.f16) mul = opset.multiply( converted_const, @@ -338,6 +337,7 @@ def transform_model( # reset name_to_node_mapping self.name_to_node_mapping = None + # clear openvino model cache OV_MODEL_CACHE.clear() return model @@ -350,6 +350,14 @@ def dump_parameters( @staticmethod def _create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = None) -> Constant: + """ + Create an OpenVINO Constant node from the given tensor. + :param x: Data tensor. Supports NumPy and OV tensor backends. If x backend is OV, the constant node is created + directly from underlying OV tensor. + :param dtype: Data type of the constant. 
+ :param name: Optional name of the constant. + :return: OpenVINO Constant node. + """ if x.backend == TensorBackend.ov: assert x.data.get_element_type() == dtype return opset.constant(x.data, name=name) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index d11679a9081..69abd5309b1 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -16,6 +16,8 @@ import numpy as np import openvino as ov +from openvino._pyopenvino.op import Parameter +from openvino.runtime import Node from openvino.runtime import opset13 as opset from nncf.common.utils.decorators import ResultsCacheContainer @@ -35,6 +37,10 @@ @dataclass(init=False) class OVModelParameters: + """ + A class to hold parameters for building and inferring an OpenVINO model. + """ + def __init__( self, input_dtypes: Optional[Dict[str, TensorDataType]] = None, @@ -46,6 +52,18 @@ def __init__( share_outputs: bool = True, return_ov_tensors: bool = False, ): + """ + :param input_dtypes: Optional dictionary mapping input names to their data types. + :param output_dtypes: Optional dictionary mapping output names to their data types. + :param dynamic_shapes: Whether to use dynamic shapes for the model. When dynamic shapes are used and + recompile is False, it allows to save on the number of models stored in a model cache. + :param recompile: Whether to recompile the model before every inference. Otherwise, compiled models are cached. + :param release_memory: Whether to release memory after every inference. If memory is released, it will be + reallocated during every inference, reducing performance to some extent. + :param share_inputs: Whether to share input tensors. Avoids cloning inputs for inference. + :param share_outputs: Whether to share output tensors. Avoids cloning outputs after the inference. + :param return_ov_tensors: Whether to return results as OpenVINO tensors or NumPy arrays. + """ self.input_dtypes = input_dtypes or {} self.output_dtypes = output_dtypes or {} self.dynamic_shapes = dynamic_shapes @@ -94,9 +112,19 @@ def __hash__(self): ) -def run_model( - ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, return_ov_tensors: bool, inputs: TensorList +ModelAsNodes = Tuple[List[Parameter], List[Node], OVModelParameters] + + +def _infer_ov_model( + ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList ) -> TensorList: + """ + Run compiled OpenVINO model inference on the given inputs. + :param ov_model_params: OV model related parameters. + :param compiled_model: Compiled OpenVINO model. + :param inputs: Input tensors. + :return: List of output tensors. Tensor backend is OV if return_ov_tensors is True, else NumPy. 
+ """ # Check that input dtypes match the expected dtypes for i, inp in enumerate(compiled_model.inputs): input_name = inp.any_name @@ -107,7 +135,7 @@ def run_model( # Infer the model inputs = [inp.data for inp in inputs] - if return_ov_tensors: + if ov_model_params.return_ov_tensors: infer_request = compiled_model.create_infer_request() infer_request.infer( inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs @@ -134,10 +162,28 @@ def get_compress_weight_model( zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, return_nodes: Optional[bool] = False, -) -> ModelCallable: +) -> Union[ModelCallable, ModelAsNodes]: + """ + Get a model that compresses weights using the given configuration. + :param ov_model_params: OV model parameters. + :param config: Compression configuration. + :param weight_shape: Shape of the weight to compress. Weight is assumed to be already reshaped as needed. + :param scale_shape: Optional shape of the scale. If not provided, scale will be computed by the OV model. + Otherwise, it is expected that the scale tensor is given as an input to the model. + :param zero_point_shape: Optional shape of the zero point tensor. If not provided and the mode is asymmetric, + zero point will be computed by the OV model. Otherwise, it is expected that the zero point tensor is provided + as an input. + :param reduction_axes: Optional axes to reduce the weight tensor. Not needed if scale (and z.p.) are provided as + inputs. + :param return_nodes: Whether to return the OV model inputs parameters and results nodes instead of the model + callable. + :return: A model callable that compresses weights using the given configuration. Or a model as nodes, if + `return_nodes` is True. + """ if scale_shape is None and zero_point_shape is not None: raise Exception("Zero point shape can only be provided if scale shape is provided.") + # Set dynamic shapes if needed if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) if scale_shape is not None: @@ -166,6 +212,25 @@ def get_compress_decompress_weight_model( reduction_axes: Optional[Tuple] = None, return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: + """ + Get a model that performs compression and decompression of the given weight. + :param ov_model_params: OV model parameters. + :param config: Compression configuration. + :param weight_shape: Shape of the weight. Weight is assumed to be already reshaped as needed. + :param scale_shape: Optional shape of the scale. If not provided, scale will be computed by the OV model. + Otherwise, it is expected that the scale tensor is given as an input to the model. + :param zero_point_shape: Optional shape of the zero point tensor. If not provided and the mode is asymmetric, + zero point will be computed by the OV model. Otherwise, it is expected that the zero point is provided as an + input. + :param reduction_axes: Optional axes to reduce the weight tensor. Not needed if scale (and z.p.) are provided as + inputs. + :param return_compressed_weight: Whether to also return compressed weight, scale, (and zero point) besides the + decompressed weight. + :return: A model callable that returns a decompressed weight, and optionally compressed weight, scale, + (and zero point) if `return_compressed_weight` is True. 
+ """ + + # Set dynamic shapes if needed if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) if scale_shape is not None: @@ -194,8 +259,9 @@ def _build_compress_model( zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, -) -> Union[ModelCallable, Tuple[OVModelParameters, List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]: +) -> Union[ModelCallable, ModelAsNodes]: is_int_asym = config.is_int_asym + default_input_dtypes = { "scale": TensorDataType.float32, "zero_point": TensorDataType.int32, @@ -205,10 +271,15 @@ def _build_compress_model( "scale": TensorDataType.float32, "zero_point": TensorDataType.int32, } + + # Update input and output dtypes with the default values ov_model_params = copy.deepcopy(ov_model_params) ov_model_params.input_dtypes = {**default_input_dtypes, **ov_model_params.input_dtypes} ov_model_params.output_dtypes = {**default_output_dtypes, **ov_model_params.output_dtypes} + if "weight" not in ov_model_params.input_dtypes: + raise ValueError("Input weight dtype is required!") + weight_dtype = ov_model_params.input_dtypes["weight"] input_scale_dtype = ov_model_params.input_dtypes["scale"] input_zero_point_dtype = ov_model_params.input_dtypes["zero_point"] @@ -255,12 +326,8 @@ def _build_compress_model( num_bits = config.num_bits eps = np.finfo(np.float32).eps - if is_int_asym: - level_low = 0 - level_high = 2**num_bits - 1 - else: - level_low = -(2 ** (num_bits - 1)) - level_high = 2 ** (num_bits - 1) - 1 + level_low = 0 if is_int_asym else -(2 ** (num_bits - 1)) + level_high = 2**num_bits - 1 if is_int_asym else 2 ** (num_bits - 1) - 1 min_values = None if scale_shape is not None: @@ -270,12 +337,9 @@ def _build_compress_model( else: # Compute scale if is_int_asym: - min_values = opset.reduce_min( - weight, reduction_axes=reduction_axes, keep_dims=True - ) # [a1, r, a2] -> [a1, 1, a2] - max_values = opset.reduce_max( - weight, reduction_axes=reduction_axes, keep_dims=True - ) # [a1, r, a2] -> [a1, 1, a2] + # [a1, r, a2] -> [a1, 1, a2] + min_values = opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True) + max_values = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True) min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) levels = level_high - level_low + 1 @@ -300,9 +364,8 @@ def _build_compress_model( elif is_int_asym: # Compute zero point if min_values is None: - min_values = opset.reduce_min( - weight, reduction_axes=reduction_axes, keep_dims=True - ) # [a1, r, a2] -> [a1, 1, a2] + # [a1, r, a2] -> [a1, 1, a2] + min_values = opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True) min_values = opset.convert(min_values, ov.Type.f32) zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) zero_point = opset.clamp(zero_point, level_low, level_high) @@ -325,12 +388,12 @@ def _build_compress_model( ov_results.append(zero_point) if return_nodes: - return ov_model_params, ov_parameters, ov_results + return ov_parameters, ov_results, ov_model_params model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) + return partial(_infer_ov_model, ov_model_params, compiled_model) @cache_results(OV_MODEL_CACHE) @@ -345,6 +408,7 @@ def _build_compress_decompress_model( ) -> ModelCallable: default_output_dtypes = 
{"decompressed_weight": TensorDataType.float32} if not return_compressed_weight: + # If compressed weight is not returned to a user, we can keep it in float32 to avoid additional conversion default_output_dtypes["compressed_weight"] = TensorDataType.float32 ov_model_params = copy.deepcopy(ov_model_params) ov_model_params.output_dtypes = {**default_output_dtypes, **ov_model_params.output_dtypes} @@ -353,7 +417,8 @@ def _build_compress_decompress_model( if decompressed_weight_dtype != TensorDataType.float32: raise ValueError(f"Decompressed weight must be of float32 data type. But found: {decompressed_weight_dtype}.") - ov_model_params, ov_parameters, ov_results = get_compress_weight_model( + # Get compression model as input/result nodes and potentially modified ov model parameters + ov_parameters, ov_results, ov_model_params = get_compress_weight_model( ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True ) @@ -384,10 +449,21 @@ def _build_compress_decompress_model( model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) + return partial(_infer_ov_model, ov_model_params, compiled_model) def get_astype_model(ov_model_params: OVModelParameters, input_shape: Tuple) -> ModelCallable: + """ + Return a model that cast the input of the given shape to the given data type. Especially useful for + casting from/to data types not supported by NumPy such as bfloat16, uint4 and int4. + These data types are represented as the following data types in numpy: + - bfloat16 -> np.float16, + - uint4 -> uint8, + - int4 -> int8. + :param ov_model_params: OV model related parameters. + :param input_shape: Shape of the tensor to cast. + :return: A model callable that casts the input tensor to the given data type. 
+ """ if ov_model_params.dynamic_shapes: input_shape = (-1,) * len(input_shape) return _build_astype_model(ov_model_params, input_shape, disable_caching=ov_model_params.recompile) @@ -411,4 +487,4 @@ def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> model = ov.Model([res], [arg]) compiled_model = ov.compile_model(model, device_name="CPU") - return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) + return partial(_infer_ov_model, ov_model_params, compiled_model) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 459af440696..339154ffa52 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -158,7 +158,7 @@ def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bi w_max = fns.max(weight, axis=reduction_axes, keepdims=True) scale = fns.where(w_abs_min >= w_max, w_abs_min, -w_max) - fns.inplace_divide(scale, level_high) + fns.inplace_inverted_divide(scale, level_high) eps = fns.finfo(scale).eps scale = fns.where(fns.abs(scale) < eps, eps, scale) @@ -179,7 +179,7 @@ def calculate_normalized_weight(weight: Tensor, scale: Tensor) -> Tensor: if scale.dtype != TensorDataType.float32: scale = scale.astype(TensorDataType.float32) - return fns.divide(weight, scale) + return fns.inverted_divide(weight, scale) def do_nf4_quantization(weight: Tensor, scale: Tensor, is_normalized_weight: bool = False) -> Tensor: @@ -312,7 +312,7 @@ def calculate_quantized_weight( level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - compressed_weights = fns.divide(weight, scale) + compressed_weights = fns.inverted_divide(weight, scale) if zero_point is not None: compressed_weights += zero_point.astype(weight.dtype) compressed_weights = fns.round(compressed_weights) @@ -430,7 +430,7 @@ def do_int_quantization( precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, ov_model_params: Optional = None, -): +) -> Tuple[Tensor, Tensor, Tensor]: """ Performs integer quantization on the given weight tensor. @@ -450,14 +450,9 @@ def do_int_quantization( "for asymmetric quantization." 
) - # import os - accelerate_through_ov = ( - is_openvino_available() - and weight.backend != TensorBackend.torch - # and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) - ) + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch if not is_openvino_available() and weight.backend != TensorBackend.torch: - log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") + log_once(logging.INFO, "Running time may be improved after installing OpenVINO") # When reduction axes are not provided, assuming that the weights are already reshaped if config.group_size != -1 and reduction_axes is not None: @@ -466,15 +461,15 @@ def do_int_quantization( if not accelerate_through_ov: # Reference implementation - if weight.backend == TensorBackend.ov: weight = weight.to_backend(TensorBackend.numpy) - if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) scale, zero_point = None, None if precomputed_scale is None or (config.is_int_asym and precomputed_zero_point is None): + if reduction_axes is None: + raise ValueError("Reduction axes are required for computing the scale and (zero point) vectors.") scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config) if precomputed_scale is not None: scale = precomputed_scale @@ -505,11 +500,6 @@ def do_int_quantization( {"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype} ) - # ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) - # ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) - # ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) - # ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0"))) - model = get_compress_weight_model( ov_model_params, config, @@ -553,25 +543,29 @@ def calculate_quantized_dequantized_weight( return_compressed_weight: Optional[bool] = False, ov_model_params: Optional = None, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: - # import os - accelerate_through_ov = ( - is_openvino_available() - and weight.backend != TensorBackend.torch - # and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) - ) + """ + First quantizes the given weight tensor and then dequantizes it back to obtain float32 values. + :param weight: The weight tensor to quantize-dequantize. + :param config: Compression configuration. + :param reduction_axes: Axes along which to reduce (collect) statistics (e.g., min, max). Not required if + precomputed scale (and zero point) are provided. + :param precomputed_scale: Optional precomputed scale tensor. + :param precomputed_zero_point: Optional precomputed zero point tensor. + :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight, scale, + (and zero point). + :param ov_model_params: OpenVINO model parameters for acceleration. + :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight, scale, + (and zero point). 
+ """ + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") if not accelerate_through_ov: # Reference implementation - if precomputed_scale is None or (config.is_int_asym and precomputed_zero_point is None): - compressed_weight, scale, zero_point = do_int_quantization( - weight, config, reduction_axes, precomputed_scale, precomputed_zero_point - ) - else: - scale = precomputed_scale if precomputed_scale is not None else None - zero_point = precomputed_zero_point if precomputed_zero_point is not None else None - compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point) + compressed_weight, scale, zero_point = do_int_quantization( + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point + ) decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) if return_compressed_weight: return decompressed_weight, compressed_weight, scale, zero_point diff --git a/nncf/quantization/fake_quantize.py b/nncf/quantization/fake_quantize.py index 385cef9ca2e..9b258e40d56 100644 --- a/nncf/quantization/fake_quantize.py +++ b/nncf/quantization/fake_quantize.py @@ -359,11 +359,11 @@ def calculate_scale_zero_point( :return: Scale and Zero point values. """ levels = level_high - level_low if narrow_range else level_high - level_low + 1 - scale = fns.divide((input_high - input_low), (levels - 1)).astype(TensorDataType.float32) + scale = fns.inverted_divide((input_high - input_low), (levels - 1)).astype(TensorDataType.float32) eps = fns.finfo(scale).eps # NOTE: adding machine epsilon to avoid division by zero scale = fns.where(fns.abs(scale) < eps, eps, scale) expected_level_low = level_low + 1 if narrow_range else level_low - zero_point = expected_level_low - fns.round(fns.divide(input_low, scale)) + zero_point = expected_level_low - fns.round(fns.inverted_divide(input_low, scale)) zero_point = fns.clip(zero_point.astype(TensorDataType.int32), level_low, level_high) return scale, zero_point diff --git a/nncf/tensor/functions/__init__.py b/nncf/tensor/functions/__init__.py index 52bc666dfa3..bacd09ee2bf 100644 --- a/nncf/tensor/functions/__init__.py +++ b/nncf/tensor/functions/__init__.py @@ -24,14 +24,14 @@ from nncf.tensor.functions.numeric import count_nonzero as count_nonzero from nncf.tensor.functions.numeric import device as device from nncf.tensor.functions.numeric import diag as diag -from nncf.tensor.functions.numeric import divide as divide from nncf.tensor.functions.numeric import dtype as dtype from nncf.tensor.functions.numeric import expand_dims as expand_dims from nncf.tensor.functions.numeric import eye as eye from nncf.tensor.functions.numeric import finfo as finfo from nncf.tensor.functions.numeric import flatten as flatten from nncf.tensor.functions.numeric import from_numpy as from_numpy -from nncf.tensor.functions.numeric import inplace_divide as inplace_divide +from nncf.tensor.functions.numeric import inplace_inverted_divide as inplace_inverted_divide +from nncf.tensor.functions.numeric import inverted_divide as inverted_divide from nncf.tensor.functions.numeric import isclose as isclose from nncf.tensor.functions.numeric import isempty as isempty from nncf.tensor.functions.numeric import item as item diff --git a/nncf/tensor/functions/numeric.py b/nncf/tensor/functions/numeric.py index c6276a5e22f..9ce0876f191 100644 --- 
a/nncf/tensor/functions/numeric.py +++ b/nncf/tensor/functions/numeric.py @@ -910,12 +910,18 @@ def ceil(a: Tensor) -> Tensor: @functools.singledispatch @tensor_guard def to_backend(a: Tensor, b: TensorBackend) -> Tensor: + """ + Change backend of the tensor to the given one. + :param a: Tensor to change backend for. + :param b: Target backend to change to. + :return: Tensor in the target backend. + """ return Tensor(to_backend(a.data, b)) @functools.singledispatch @tensor_guard -def divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> Tensor: +def inverted_divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> Tensor: """ Divide two tensors or a tensor and a float. This function divides `a` by `b`. If `invert` is True, it performs the division as `a * (1.0 / b)`. @@ -931,7 +937,7 @@ def divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bo @functools.singledispatch @tensor_guard -def inplace_divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> None: +def inplace_inverted_divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> None: """ In-place division of two tensors or a tensor and a float. This function divides `a` by `b` in place. If `invert` is True, it performs the division as `a *= (1.0 / b)`. diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index 96bb441e45f..a316d76ac43 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -55,6 +55,7 @@ def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: TensorDataType.int4, TensorDataType.uint4, ]: + # Cannot cast to/from bfloat16, uint4, int4 directly return _astype_ov(a, dtype) return ov.Tensor(a.data.astype(DTYPE_MAP_NP[dtype])) @@ -104,6 +105,12 @@ def _(a: ov.Tensor, b: TensorBackend) -> Union[np.ndarray, ov.Tensor]: def _astype_ov(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: + """ + Cast to a different data type using an OpenVINO model. + :param a: Tensor to cast. + :param dtype: Data type to cast to. + :return: Casted openvino tensor. 
+ """ from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model diff --git a/tests/openvino/native/test_openvino_modeling.py b/tests/openvino/native/test_openvino_modeling.py index 4a6f11654c2..b4bb991d592 100644 --- a/tests/openvino/native/test_openvino_modeling.py +++ b/tests/openvino/native/test_openvino_modeling.py @@ -15,10 +15,10 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters +from nncf.quantization.algorithms.weight_compression.openvino_modeling import _infer_ov_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model -from nncf.quantization.algorithms.weight_compression.openvino_modeling import run_model from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor.definitions import TensorBackend @@ -243,7 +243,7 @@ def test_release_memory(mocker, release_memory): input_tensor.data = [1, 2, 3] inputs = [input_tensor] - run_model(ov_model_params, compiled_model, return_ov_tensors=False, inputs=inputs) + _infer_ov_model(ov_model_params, compiled_model, inputs=inputs) if release_memory: compiled_model.release_memory.assert_called_once() else: @@ -285,7 +285,7 @@ def test_share_inputs_outputs(mocker, share_inputs, share_outputs, return_ov_ten input_tensor.data = [1, 2, 3] inputs = [input_tensor] - run_model(ov_model_params, compiled_model, return_ov_tensors=return_ov_tensors, inputs=inputs) + _infer_ov_model(ov_model_params, compiled_model, inputs=inputs) if return_ov_tensors: infer_request.infer.assert_called_once_with( From 73f61fca98e69017803a79394e1a267bd17499b0 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 14:46:42 +0100 Subject: [PATCH 45/73] Remove inverted NP division. Add non-convertable OV division. 
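The rationale, roughly: in float32 arithmetic a / b and a * (1.0 / b) round differently for some inputs, so computing the division through a reciprocal (as the removed NNCF helper did explicitly, and as a Divide node may be rewritten unless marked non-convertable) can introduce small mismatches against the reference computation. A tiny illustrative check, not part of this patch and with arbitrary test data:

import numpy as np

rng = np.random.default_rng(0)
a = rng.random(100_000, dtype=np.float32)
b = rng.random(100_000, dtype=np.float32) + np.float32(0.5)  # keep divisors away from zero

direct = a / b                                     # one rounding step
via_reciprocal = a * (np.float32(1.0) / b)         # reciprocal is rounded first, then the product
print(np.count_nonzero(direct != via_reciprocal))  # typically non-zero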
--- .../weight_compression/openvino_modeling.py | 18 +++++++--- .../weight_compression/weight_lowering.py | 6 ++-- nncf/quantization/fake_quantize.py | 4 +-- nncf/tensor/functions/__init__.py | 2 -- nncf/tensor/functions/numeric.py | 35 ------------------- 5 files changed, 19 insertions(+), 46 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 69abd5309b1..d4ed33e6f73 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -343,7 +343,7 @@ def _build_compress_model( min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) levels = level_high - level_low + 1 - scale = (max_values - min_values) / opset.constant(levels - 1, ov.Type.f32) + scale = _non_convertable_divide(max_values - min_values, opset.constant(levels - 1, ov.Type.f32)) scale = opset.select(opset.abs(scale) < eps, eps, scale) else: w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True)) @@ -351,7 +351,7 @@ def _build_compress_model( w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32) scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.negative(w_max)) - scale /= opset.constant(-level_low, ov.Type.f32) + scale = _non_convertable_divide(scale, opset.constant(-level_low, ov.Type.f32)) scale = opset.select(opset.abs(scale) < eps, eps, scale) zero_point = None @@ -367,11 +367,12 @@ def _build_compress_model( # [a1, r, a2] -> [a1, 1, a2] min_values = opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True) min_values = opset.convert(min_values, ov.Type.f32) - zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) + scaled_min_values = _non_convertable_divide(min_values, scale) + zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(scaled_min_values) zero_point = opset.clamp(zero_point, level_low, level_high) weight = convert_if_needed(weight, ov.Type.f32) - compressed_weight = weight / scale + compressed_weight = _non_convertable_divide(weight, scale) if is_int_asym: compressed_weight += zero_point @@ -488,3 +489,12 @@ def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> compiled_model = ov.compile_model(model, device_name="CPU") return partial(_infer_ov_model, ov_model_params, compiled_model) + + +def _non_convertable_divide(a: Node, b: Node) -> Node: + """ + Creates a "non-convertable" divide operation. It won't be converted to a*(1/b). 
+ """ + divide_node = a / b + divide_node.get_rt_info()["nonconvertable_divide_0"] = True + return divide_node diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 339154ffa52..263d457a3e3 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -158,7 +158,7 @@ def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bi w_max = fns.max(weight, axis=reduction_axes, keepdims=True) scale = fns.where(w_abs_min >= w_max, w_abs_min, -w_max) - fns.inplace_inverted_divide(scale, level_high) + scale /= level_high eps = fns.finfo(scale).eps scale = fns.where(fns.abs(scale) < eps, eps, scale) @@ -179,7 +179,7 @@ def calculate_normalized_weight(weight: Tensor, scale: Tensor) -> Tensor: if scale.dtype != TensorDataType.float32: scale = scale.astype(TensorDataType.float32) - return fns.inverted_divide(weight, scale) + return weight / scale def do_nf4_quantization(weight: Tensor, scale: Tensor, is_normalized_weight: bool = False) -> Tensor: @@ -312,7 +312,7 @@ def calculate_quantized_weight( level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - compressed_weights = fns.inverted_divide(weight, scale) + compressed_weights = weight / scale if zero_point is not None: compressed_weights += zero_point.astype(weight.dtype) compressed_weights = fns.round(compressed_weights) diff --git a/nncf/quantization/fake_quantize.py b/nncf/quantization/fake_quantize.py index 9b258e40d56..cd72bd5ce4f 100644 --- a/nncf/quantization/fake_quantize.py +++ b/nncf/quantization/fake_quantize.py @@ -359,11 +359,11 @@ def calculate_scale_zero_point( :return: Scale and Zero point values. 
""" levels = level_high - level_low if narrow_range else level_high - level_low + 1 - scale = fns.inverted_divide((input_high - input_low), (levels - 1)).astype(TensorDataType.float32) + scale = ((input_high - input_low) / (levels - 1)).astype(TensorDataType.float32) eps = fns.finfo(scale).eps # NOTE: adding machine epsilon to avoid division by zero scale = fns.where(fns.abs(scale) < eps, eps, scale) expected_level_low = level_low + 1 if narrow_range else level_low - zero_point = expected_level_low - fns.round(fns.inverted_divide(input_low, scale)) + zero_point = expected_level_low - fns.round(input_low / scale) zero_point = fns.clip(zero_point.astype(TensorDataType.int32), level_low, level_high) return scale, zero_point diff --git a/nncf/tensor/functions/__init__.py b/nncf/tensor/functions/__init__.py index bacd09ee2bf..9affab79c90 100644 --- a/nncf/tensor/functions/__init__.py +++ b/nncf/tensor/functions/__init__.py @@ -30,8 +30,6 @@ from nncf.tensor.functions.numeric import finfo as finfo from nncf.tensor.functions.numeric import flatten as flatten from nncf.tensor.functions.numeric import from_numpy as from_numpy -from nncf.tensor.functions.numeric import inplace_inverted_divide as inplace_inverted_divide -from nncf.tensor.functions.numeric import inverted_divide as inverted_divide from nncf.tensor.functions.numeric import isclose as isclose from nncf.tensor.functions.numeric import isempty as isempty from nncf.tensor.functions.numeric import item as item diff --git a/nncf/tensor/functions/numeric.py b/nncf/tensor/functions/numeric.py index 9ce0876f191..4d73549f9b4 100644 --- a/nncf/tensor/functions/numeric.py +++ b/nncf/tensor/functions/numeric.py @@ -917,38 +917,3 @@ def to_backend(a: Tensor, b: TensorBackend) -> Tensor: :return: Tensor in the target backend. """ return Tensor(to_backend(a.data, b)) - - -@functools.singledispatch -@tensor_guard -def inverted_divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> Tensor: - """ - Divide two tensors or a tensor and a float. - This function divides `a` by `b`. If `invert` is True, it performs the division as `a * (1.0 / b)`. - Otherwise, it performs the division as `a / b`. - :param a: The first input tensor or float. - :param b: The second input tensor or float. - :param invert: If True, the division is performed as `a * (1.0 / b)`. If False, it is performed as `a / b`. - Defaults to True. - :return: A new tensor resulting from the division. - """ - return Tensor(a * (1.0 / b) if invert else a / b) - - -@functools.singledispatch -@tensor_guard -def inplace_inverted_divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> None: - """ - In-place division of two tensors or a tensor and a float. - This function divides `a` by `b` in place. If `invert` is True, it performs the division as `a *= (1.0 / b)`. - Otherwise, it performs the division as `a /= b`. - :param a: The first input tensor or float. - :param b: The second input tensor or float. - :param invert: If True, the division is performed as `a *= (1.0 / b)`. If False, the division it is as `a /= b`. - Defaults to True. - :return: None. The operation is performed in place. 
- """ - if invert: - a *= 1.0 / b - else: - a /= b From cd884ebba8df966228ca06372985812ebb1dd462 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 14:54:51 +0100 Subject: [PATCH 46/73] Remove OV 2024.5 RC installation --- .github/workflows/precommit.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index b084db6ae23..6dd6bff293d 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -64,8 +64,6 @@ jobs: cache: pip - name: Install NNCF and test requirements run: make install-openvino-test - - name: Install OpenVINO 2024.5 - run: pip install -U --pre openvino==2024.5.0rc2 openvino-tokenizers==2024.5.0rc2 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - name: Print installed modules run: pip list - name: Run OV precommit test scope From 608cfe9f105d1a9cc99c40dcd71737b323fbe254 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:08:29 +0100 Subject: [PATCH 47/73] Add a test for non-convertable division --- nncf/openvino/graph/node_utils.py | 9 +++++++++ .../weight_compression/openvino_modeling.py | 18 +++++------------- tests/openvino/native/test_node_utils.py | 19 +++++++++++++++++++ 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 67bf9143cd4..a34f3c9d785 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -645,3 +645,12 @@ def convert_if_needed(node: ov.Node, target_dtype: ov.Type) -> ov.Node: if node.get_element_type() == target_dtype: return node return opset.convert(node, target_dtype) + + +def non_convertable_divide(a: ov.Node, b: ov.Node) -> ov.Node: + """ + Creates a "non-convertable" divide operation. It won't be converted to a*(1/b). 
+ """ + divide_node = a / b + divide_node.get_rt_info()["nonconvertable_divide_0"] = True + return divide_node diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index d4ed33e6f73..54bb083a711 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -23,6 +23,7 @@ from nncf.common.utils.decorators import ResultsCacheContainer from nncf.common.utils.decorators import cache_results from nncf.openvino.graph.node_utils import convert_if_needed +from nncf.openvino.graph.node_utils import non_convertable_divide from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.tensor import Tensor from nncf.tensor import TensorDataType @@ -343,7 +344,7 @@ def _build_compress_model( min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) levels = level_high - level_low + 1 - scale = _non_convertable_divide(max_values - min_values, opset.constant(levels - 1, ov.Type.f32)) + scale = non_convertable_divide(max_values - min_values, opset.constant(levels - 1, ov.Type.f32)) scale = opset.select(opset.abs(scale) < eps, eps, scale) else: w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True)) @@ -351,7 +352,7 @@ def _build_compress_model( w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32) scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.negative(w_max)) - scale = _non_convertable_divide(scale, opset.constant(-level_low, ov.Type.f32)) + scale = non_convertable_divide(scale, opset.constant(-level_low, ov.Type.f32)) scale = opset.select(opset.abs(scale) < eps, eps, scale) zero_point = None @@ -367,12 +368,12 @@ def _build_compress_model( # [a1, r, a2] -> [a1, 1, a2] min_values = opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True) min_values = opset.convert(min_values, ov.Type.f32) - scaled_min_values = _non_convertable_divide(min_values, scale) + scaled_min_values = non_convertable_divide(min_values, scale) zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(scaled_min_values) zero_point = opset.clamp(zero_point, level_low, level_high) weight = convert_if_needed(weight, ov.Type.f32) - compressed_weight = _non_convertable_divide(weight, scale) + compressed_weight = non_convertable_divide(weight, scale) if is_int_asym: compressed_weight += zero_point @@ -489,12 +490,3 @@ def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> compiled_model = ov.compile_model(model, device_name="CPU") return partial(_infer_ov_model, ov_model_params, compiled_model) - - -def _non_convertable_divide(a: Node, b: Node) -> Node: - """ - Creates a "non-convertable" divide operation. It won't be converted to a*(1/b). 
- """ - divide_node = a / b - divide_node.get_rt_info()["nonconvertable_divide_0"] = True - return divide_node diff --git a/tests/openvino/native/test_node_utils.py b/tests/openvino/native/test_node_utils.py index 241b9e6f156..dc09cda77e5 100644 --- a/tests/openvino/native/test_node_utils.py +++ b/tests/openvino/native/test_node_utils.py @@ -22,6 +22,7 @@ from nncf.openvino.graph.node_utils import get_weight_channel_axes from nncf.openvino.graph.node_utils import get_weighted_layer_attributes from nncf.openvino.graph.node_utils import is_node_with_bias +from nncf.openvino.graph.node_utils import non_convertable_divide from tests.openvino.native.models import ConvModel from tests.openvino.native.models import ConvNotBiasModel from tests.openvino.native.models import MatMul2DModel @@ -147,3 +148,21 @@ def test_get_weight_channel_axes_for_matmul(weights_port_id, transpose, shape, d assert len(actual_channel_axes) == len(expected_channel_axes) assert all(a == b for a, b in zip(actual_channel_axes, expected_channel_axes)) + + +@pytest.mark.parametrize( + "a,b,convertable,ref_result", + [ + (0.058599039912223816, 15, True, 0.003906603), + (0.058599039912223816, 15, False, 0.003906602505594492), + ], +) +def test_non_convertable_division(a, b, convertable, ref_result): + a, b, ref_result = tuple(map(lambda x: np.array([x], np.float32), [a, b, ref_result])) + a_param = opset.parameter((-1,), ov.Type.f32) + b_param = opset.parameter((-1,), ov.Type.f32) + division = (a_param / b_param) if convertable else non_convertable_divide(a_param, b_param) + model = ov.Model([division], [a_param, b_param]) + compiled_model = ov.compile_model(model, device_name="CPU") + actual_result = compiled_model([a, b])[0] + np.testing.assert_allclose(actual_result, ref_result, atol=0, rtol=0) From 9569e1e41cc8eefb274297cde3c7196d9de6a798 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:11:47 +0100 Subject: [PATCH 48/73] Make the test more strict --- .../native/quantization/test_ov_modeling_compression.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/openvino/native/quantization/test_ov_modeling_compression.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py index 3e09714cae0..9a319aba742 100644 --- a/tests/openvino/native/quantization/test_ov_modeling_compression.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -202,9 +202,9 @@ def test_quantization_alignment( assert scale.backend == TensorBackend.numpy if precompute_s_zp: # In case of precomputed scale or zero point, the returned scale and z.p. should equal the given ones - np.testing.assert_allclose(precomputed_scale.data, scale.data) + np.testing.assert_allclose(precomputed_scale.data, scale.data, atol=0, rtol=0) if config.is_int_asym: - np.testing.assert_allclose(precomputed_zero_point.data, zero_point.data) + np.testing.assert_allclose(precomputed_zero_point.data, zero_point.data, atol=0, rtol=0) if ( quantization_task == QuantizationTask.Q @@ -274,7 +274,7 @@ def test_quantization_alignment( # Check that the computed tensors are equal between implementations np.testing.assert_allclose( - numpy_result.data, ov_result.data, atol=atol, err_msg=f"Results do not align for {key}." + ov_result.data, numpy_result.data, atol=atol, rtol=0, err_msg=f"Results do not align for {key}." 
) if max_misalignment_frequency is not None: From f962bd1ef26ed154cc2ca40898f0eb69abb0eff6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:23:06 +0100 Subject: [PATCH 49/73] Remove unnecessary lines --- .../algorithms/weight_compression/openvino_modeling.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 54bb083a711..c131161a945 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -364,10 +364,6 @@ def _build_compress_model( zero_point = convert_if_needed(zero_point, ov.Type.f32) elif is_int_asym: # Compute zero point - if min_values is None: - # [a1, r, a2] -> [a1, 1, a2] - min_values = opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True) - min_values = opset.convert(min_values, ov.Type.f32) scaled_min_values = non_convertable_divide(min_values, scale) zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(scaled_min_values) zero_point = opset.clamp(zero_point, level_low, level_high) From 5dcd83df999f462608c15f24b6adcaf1dab5867a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:30:35 +0100 Subject: [PATCH 50/73] Update get_integer_quantization_error implementation --- .../algorithms/weight_compression/weight_lowering.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 263d457a3e3..9e80e1e95de 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -340,8 +340,7 @@ def get_integer_quantization_error( if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - compressed_weights, scale, zero_point = do_int_quantization(weight, config, reduction_axes) - decompressed_weight = do_int_dequantization(compressed_weights, scale, zero_point) + decompressed_weight = calculate_quantized_dequantized_weight(weight, config, reduction_axes) decompressed_weight = decompressed_weight.reshape(orig_shape) diff = (decompressed_weight - weight) ** 2 From 6e22ef5e7af9ab2720a71ff4755a09bcd3090efb Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:44:02 +0100 Subject: [PATCH 51/73] Remove unnecessary convert --- .../algorithms/weight_compression/openvino_backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 49e842f72d5..4a944553abe 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -243,7 +243,6 @@ def _create_compression_subgraph( compressed_const = self._create_ov_const_from_tensor( compressed_weight.tensor, compression_dtype, name=const_node_name ) - compressed_const = convert_if_needed(compressed_const, compression_dtype) converted_const = opset.convert(compressed_const, ov.Type.f16) if compressed_weight.zero_point is not None: From b45e7889a0f6e16a7a6c836e1571e884710f30d3 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:46:53 +0100 Subject: [PATCH 52/73] Move create_ov_const_from_tensor to 
node_utils --- nncf/openvino/graph/node_utils.py | 19 ++++++++++++++ .../weight_compression/openvino_backend.py | 26 +++---------------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index a34f3c9d785..80432c7fc7e 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -14,6 +14,7 @@ import numpy as np import openvino.runtime as ov import openvino.runtime.opset13 as opset +from openvino._pyopenvino.op import Constant import nncf from nncf.common.graph.graph import NNCFGraph @@ -41,6 +42,8 @@ from nncf.openvino.graph.metatypes.openvino_metatypes import OVMatMulMetatype from nncf.openvino.graph.metatypes.openvino_metatypes import OVOpMetatype from nncf.openvino.graph.metatypes.openvino_metatypes import get_node_metatype +from nncf.tensor import Tensor +from nncf.tensor import TensorBackend InplaceInsertionFnType = Callable[[ov.Node, int, str], ov.Node] @@ -654,3 +657,19 @@ def non_convertable_divide(a: ov.Node, b: ov.Node) -> ov.Node: divide_node = a / b divide_node.get_rt_info()["nonconvertable_divide_0"] = True return divide_node + + +def create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = None) -> Constant: + """ + Create an OpenVINO Constant node from the given tensor. + :param x: Data tensor. Supports NumPy and OV tensor backends. If x backend is OV, the constant node is created + directly from underlying OV tensor. + :param dtype: Data type of the constant. + :param name: Optional name of the constant. + :return: OpenVINO Constant node. + """ + if x.backend == TensorBackend.ov: + assert x.data.get_element_type() == dtype + return opset.constant(x.data, name=name) + const = opset.constant(x.data, dtype=dtype, name=name) + return const diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 4a944553abe..0f26b1a800b 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -12,7 +12,6 @@ import openvino as ov from openvino.runtime import opset13 as opset -from openvino.runtime.op import Constant import nncf from nncf.common.graph import NNCFGraph @@ -31,6 +30,7 @@ from nncf.openvino.graph.metatypes.groups import ATOMIC_ACTIVATIONS_OPERATIONS from nncf.openvino.graph.model_transformer import OVModelTransformer from nncf.openvino.graph.node_utils import convert_if_needed +from nncf.openvino.graph.node_utils import create_ov_const_from_tensor from nncf.openvino.graph.node_utils import get_const_value from nncf.openvino.graph.node_utils import get_weight_channel_axes from nncf.openvino.graph.transformations.command_creation import OVCommandCreator @@ -240,13 +240,13 @@ def _create_compression_subgraph( original_shape = weight.shape compressed_weight = compress_weight(weight, reduction_axes, compression_config, layer_scales, layer_zero_points) - compressed_const = self._create_ov_const_from_tensor( + compressed_const = create_ov_const_from_tensor( compressed_weight.tensor, compression_dtype, name=const_node_name ) converted_const = opset.convert(compressed_const, ov.Type.f16) if compressed_weight.zero_point is not None: - zero_point_const = self._create_ov_const_from_tensor( + zero_point_const = create_ov_const_from_tensor( compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point" ) zero_point_const = 
opset.convert(zero_point_const, ov.Type.f16) @@ -254,9 +254,7 @@ def _create_compression_subgraph( converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract" ) - scale_const = self._create_ov_const_from_tensor( - compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale" - ) + scale_const = create_ov_const_from_tensor(compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale") scale_const = convert_if_needed(scale_const, ov.Type.f16) mul = opset.multiply( @@ -347,22 +345,6 @@ def dump_parameters( ) -> None: dump_parameters(model, parameters, algo_name, path) - @staticmethod - def _create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = None) -> Constant: - """ - Create an OpenVINO Constant node from the given tensor. - :param x: Data tensor. Supports NumPy and OV tensor backends. If x backend is OV, the constant node is created - directly from underlying OV tensor. - :param dtype: Data type of the constant. - :param name: Optional name of the constant. - :return: OpenVINO Constant node. - """ - if x.backend == TensorBackend.ov: - assert x.data.get_element_type() == dtype - return opset.constant(x.data, name=name) - const = opset.constant(x.data, dtype=dtype, name=name) - return const - class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend): @staticmethod From b2cebd0d7a52c70a5b5ffd490c899c0e1d3087fa Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:57:03 +0100 Subject: [PATCH 53/73] Separate checking logic into standalone methods --- .../test_ov_modeling_compression.py | 103 +++++++++++------- 1 file changed, 66 insertions(+), 37 deletions(-) diff --git a/tests/openvino/native/quantization/test_ov_modeling_compression.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py index 9a319aba742..3d7f9d3b4c1 100644 --- a/tests/openvino/native/quantization/test_ov_modeling_compression.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -196,42 +196,11 @@ def test_quantization_alignment( else: mock.assert_called_once() - if quantization_task != QuantizationTask.Q_DQ: - # Scale should always be float32 and numpy backend - assert scale.dtype == TensorDataType.float32 - assert scale.backend == TensorBackend.numpy - if precompute_s_zp: - # In case of precomputed scale or zero point, the returned scale and z.p. 
should equal the given ones - np.testing.assert_allclose(precomputed_scale.data, scale.data, atol=0, rtol=0) - if config.is_int_asym: - np.testing.assert_allclose(precomputed_zero_point.data, zero_point.data, atol=0, rtol=0) - - if ( - quantization_task == QuantizationTask.Q - and cb == ComputationBackend.OV - and weight_tensor_backend == TensorBackend.ov - and config.num_bits == 4 - ): - # For 4 bit compression in case of ov implementation and ov backend the compressed weight and the computed - # zero point must be in ov backend and have (u)int4 dtype in order to be able to insert them into OV model - # without re-packing - assert compressed_weight.backend == TensorBackend.ov - assert compressed_weight.dtype == (TensorDataType.uint4 if config.is_int_asym else TensorDataType.int4) - if config.is_int_asym and not precompute_s_zp: - assert zero_point.backend == TensorBackend.ov - assert zero_point.dtype == TensorDataType.uint4 - else: - if quantization_task != QuantizationTask.Q_DQ: - # Otherwise compressed weight and zero point must be returned in numpy backend, compressed weight must - # be of (u)int8 data type, zero point -- in int32 - assert compressed_weight.backend == TensorBackend.numpy - assert compressed_weight.dtype == (TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8) - if config.is_int_asym and not precompute_s_zp: - assert zero_point.backend == TensorBackend.numpy - assert zero_point.dtype == TensorDataType.int32 - if quantization_task != QuantizationTask.Q: - assert decompressed_weight.backend == TensorBackend.numpy - assert decompressed_weight.dtype == TensorDataType.float32 + if quantization_task != QuantizationTask.Q_DQ and precompute_s_zp: + # In case of precomputed scale or zero point, the returned scale and z.p. 
should equal the given ones + np.testing.assert_allclose(precomputed_scale.data, scale.data, atol=0, rtol=0) + if config.is_int_asym: + np.testing.assert_allclose(precomputed_zero_point.data, zero_point.data, atol=0, rtol=0) # Save results for comparison between implementations if quantization_task != QuantizationTask.Q: @@ -242,6 +211,66 @@ def test_quantization_alignment( if config.is_int_asym: results[cb]["zero_point"] = zero_point.to_backend(TensorBackend.numpy) + _check_backends_and_dtypes( + quantization_task, + cb, + weight_tensor_backend, + config, + precompute_s_zp, + compressed_weight, + scale, + zero_point, + decompressed_weight, + ) + + _check_values(static_shapes, config, precompute_s_zp, dtype, results, precomputed_scale, weight_shape) + + +def _check_backends_and_dtypes( + quantization_task, + cb, + weight_tensor_backend, + config, + precompute_s_zp, + compressed_weight, + scale, + zero_point, + decompressed_weight, +): + if quantization_task != QuantizationTask.Q_DQ: + # Scale should always be float32 and numpy backend + assert scale.dtype == TensorDataType.float32 + assert scale.backend == TensorBackend.numpy + + if ( + quantization_task == QuantizationTask.Q + and cb == ComputationBackend.OV + and weight_tensor_backend == TensorBackend.ov + and config.num_bits == 4 + ): + # For 4 bit compression in case of ov implementation and ov backend the compressed weight and the computed + # zero point must be in ov backend and have (u)int4 dtype in order to be able to insert them into OV model + # without re-packing + assert compressed_weight.backend == TensorBackend.ov + assert compressed_weight.dtype == (TensorDataType.uint4 if config.is_int_asym else TensorDataType.int4) + if config.is_int_asym and not precompute_s_zp: + assert zero_point.backend == TensorBackend.ov + assert zero_point.dtype == TensorDataType.uint4 + else: + if quantization_task != QuantizationTask.Q_DQ: + # Otherwise compressed weight and zero point must be returned in numpy backend, compressed weight must + # be of (u)int8 data type, zero point -- in int32 + assert compressed_weight.backend == TensorBackend.numpy + assert compressed_weight.dtype == (TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8) + if config.is_int_asym and not precompute_s_zp: + assert zero_point.backend == TensorBackend.numpy + assert zero_point.dtype == TensorDataType.int32 + if quantization_task != QuantizationTask.Q: + assert decompressed_weight.backend == TensorBackend.numpy + assert decompressed_weight.dtype == TensorDataType.float32 + + +def _check_values(static_shapes, config, precompute_s_zp, dtype, results, precomputed_scale, weight_shape): keys = set(results[ComputationBackend.OV]).union(set(results[ComputationBackend.NumPy])) for key in keys: numpy_result = results[ComputationBackend.NumPy][key] @@ -250,7 +279,7 @@ def test_quantization_alignment( atol = 0 scale = None # For static-shaped OV models doing asymmetric compression there maybe misalignments between OV and NumPy - # For more details see 156511 + # For more details see ticket 156511 if static_shapes and config.is_int_asym: if key == "compressed_weight": atol = MAX_MISALIGNMENT_MAGNITUDE From 3a7114121e20c0c129d674082cc4f47ea5ed972b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 17:31:30 +0100 Subject: [PATCH 54/73] Add debug conditions --- .../algorithms/weight_compression/openvino_backend.py | 5 +++-- .../algorithms/weight_compression/weight_lowering.py | 8 +++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git 
a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 0f26b1a800b..018ff64985c 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -285,7 +285,8 @@ def transform_model( const_node = self.name_to_node_mapping[const_node_name] const_node_output = const_node.output(0) const_dtype = const_node_output.get_element_type() - weight = get_const_value(const_node, cast_bf16_to_fp32=False) + import os + weight = get_const_value(const_node, cast_bf16_to_fp32=False or bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))) # Creation of ov.Tensor is required for two reasons: # 1. To be able to process BF16 weight properly # 2. To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed @@ -335,7 +336,7 @@ def transform_model( self.name_to_node_mapping = None # clear openvino model cache - OV_MODEL_CACHE.clear() + # OV_MODEL_CACHE.clear() return model diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 9e80e1e95de..99f3053fbe1 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -449,7 +449,8 @@ def do_int_quantization( "for asymmetric quantization." ) - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch + import os + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Running time may be improved after installing OpenVINO") @@ -499,6 +500,11 @@ def do_int_quantization( {"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype} ) + ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) + ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) + ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) + ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0"))) + model = get_compress_weight_model( ov_model_params, config, From eeadf1d1b856b3e3039aeff2a5f62f8b952e8717 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 10:40:48 +0100 Subject: [PATCH 55/73] Move ov model cache clearing to ov backend destructor --- .../algorithms/weight_compression/openvino_backend.py | 8 ++++---- .../algorithms/weight_compression/openvino_modeling.py | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 018ff64985c..1e302dcfa6a 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -49,7 +49,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm -from nncf.quantization.algorithms.weight_compression.openvino_modeling import 
OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import clear_ov_model_cache from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorBackend @@ -335,9 +335,6 @@ def transform_model( # reset name_to_node_mapping self.name_to_node_mapping = None - # clear openvino model cache - # OV_MODEL_CACHE.clear() - return model @staticmethod @@ -346,6 +343,9 @@ def dump_parameters( ) -> None: dump_parameters(model, parameters, algo_name, path) + def __del__(self): + clear_ov_model_cache() + class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend): @staticmethod diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index c131161a945..fa08fd5f7b9 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -116,6 +116,10 @@ def __hash__(self): ModelAsNodes = Tuple[List[Parameter], List[Node], OVModelParameters] +def clear_ov_model_cache(): + OV_MODEL_CACHE.clear() + + def _infer_ov_model( ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList ) -> TensorList: From 40aef547f5384f26dfbb534893bda41a7c10ea2b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 10:52:06 +0100 Subject: [PATCH 56/73] Update default ov model parameters --- .../weight_compression/openvino_backend.py | 10 +++++++++- .../weight_compression/openvino_modeling.py | 2 +- .../weight_compression/weight_lowering.py | 4 +++- nncf/tensor/functions/ov.py | 17 ++++------------- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 1e302dcfa6a..5272fcf166f 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -49,6 +49,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters from nncf.quantization.algorithms.weight_compression.openvino_modeling import clear_ov_model_cache from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor @@ -238,7 +239,14 @@ def _create_compression_subgraph( raise nncf.ParameterNotSupportedError(f"{compression_config.mode.value} is not supported.") original_shape = weight.shape - compressed_weight = compress_weight(weight, reduction_axes, compression_config, layer_scales, layer_zero_points) + compressed_weight = compress_weight( + weight, + reduction_axes, + compression_config, + layer_scales, + layer_zero_points, + OVModelParameters(recompile=True, release_memory=False), + ) compressed_const = create_ov_const_from_tensor( compressed_weight.tensor, compression_dtype, name=const_node_name diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index fa08fd5f7b9..e90d1716f49 
100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -46,7 +46,7 @@ def __init__( self, input_dtypes: Optional[Dict[str, TensorDataType]] = None, output_dtypes: Optional[Dict[str, TensorDataType]] = None, - dynamic_shapes: bool = False, + dynamic_shapes: bool = True, recompile: bool = False, release_memory: bool = True, share_inputs: bool = True, diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 99f3053fbe1..999888462c3 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -355,6 +355,7 @@ def compress_weight( config: WeightCompressionConfig, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, + ov_model_params: Optional = None, ): """ Compress weight using compression configuration. @@ -364,6 +365,7 @@ def compress_weight( :param config: Compression configuration. :param precomputed_scale: Precomputed scale. :param precomputed_zero_point: Precomputed zero point. + :param ov_model_params: OpenVINO model parameters for acceleration. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ if not config.is_integer: @@ -375,7 +377,7 @@ def compress_weight( ) return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_int_quantization( - weight, config, reduction_axes, precomputed_scale, precomputed_zero_point + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, ov_model_params ) return CompressedWeight(compressed_weight, scale, zero_point) diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index a316d76ac43..9a2d43d79d3 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -116,17 +116,8 @@ def _astype_ov(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: a_dtype = DTYPE_MAP_REV[a.get_element_type()] - model = get_astype_model( - OVModelParameters( - input_dtypes={"input": a_dtype}, - output_dtypes={"output": dtype}, - dynamic_shapes=False, - recompile=True, - release_memory=True, - share_inputs=True, - share_outputs=True, - return_ov_tensors=True, - ), - tuple(a.shape), - ) + ov_model_params = OVModelParameters(recompile=True, release_memory=False, return_ov_tensors=True) + ov_model_params.input_dtypes = {"input": a_dtype} + ov_model_params.output_dtypes = {"output": dtype} + model = get_astype_model(ov_model_params, tuple(a.shape)) return model([Tensor(a)])[0].data From ab3d35f0dc5a5a174c46f7ef63ac7948c7ca4c62 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 11:04:55 +0100 Subject: [PATCH 57/73] Revert debug logic --- .../algorithms/weight_compression/openvino_backend.py | 3 +-- .../algorithms/weight_compression/weight_lowering.py | 8 +------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 5272fcf166f..0eaa6b72532 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -293,8 +293,7 @@ def transform_model( const_node = self.name_to_node_mapping[const_node_name] const_node_output = const_node.output(0) const_dtype = 
const_node_output.get_element_type() - import os - weight = get_const_value(const_node, cast_bf16_to_fp32=False or bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))) + weight = get_const_value(const_node, cast_bf16_to_fp32=False) # Creation of ov.Tensor is required for two reasons: # 1. To be able to process BF16 weight properly # 2. To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 999888462c3..9de76e5ce71 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -451,8 +451,7 @@ def do_int_quantization( "for asymmetric quantization." ) - import os - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Running time may be improved after installing OpenVINO") @@ -502,11 +501,6 @@ def do_int_quantization( {"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype} ) - ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) - ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) - ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) - ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0"))) - model = get_compress_weight_model( ov_model_params, config, From d48c748bcb2f53d481d20751dc4693c581c916f8 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 15:59:51 +0100 Subject: [PATCH 58/73] Update reference --- tests/post_training/data/wc_reference_data.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 6c48904c91a..683dc62f401 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -23,7 +23,7 @@ tinyllama_int8_data_free_backend_TORCH: num_int4: 0 num_int8: 312 tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV: - metric_value: 0.86503 + metric_value: 0.88669 num_int4: 94 num_int8: 124 metrics_xfail_reason: "Issue-148819" From 9a56fae2692fc22bb6e25f74a1c5f00dfc078e86 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 17:31:30 +0100 Subject: [PATCH 59/73] Add debug conditions --- .../algorithms/weight_compression/openvino_backend.py | 3 ++- .../algorithms/weight_compression/weight_lowering.py | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 0eaa6b72532..5272fcf166f 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -293,7 +293,8 @@ def transform_model( const_node = self.name_to_node_mapping[const_node_name] const_node_output = const_node.output(0) const_dtype = const_node_output.get_element_type() - weight = get_const_value(const_node, cast_bf16_to_fp32=False) + import os + weight = 
get_const_value(const_node, cast_bf16_to_fp32=False or bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))) # Creation of ov.Tensor is required for two reasons: # 1. To be able to process BF16 weight properly # 2. To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 9de76e5ce71..999888462c3 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -451,7 +451,8 @@ def do_int_quantization( "for asymmetric quantization." ) - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch + import os + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Running time may be improved after installing OpenVINO") @@ -501,6 +502,11 @@ def do_int_quantization( {"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype} ) + ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) + ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) + ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) + ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0"))) + model = get_compress_weight_model( ov_model_params, config, From e10d806de2f20200f8bcddbf94220aadbfa1aba3 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 20:15:16 +0100 Subject: [PATCH 60/73] Disable dynamic shapes by default --- .../algorithms/weight_compression/openvino_backend.py | 2 +- .../algorithms/weight_compression/openvino_modeling.py | 2 +- nncf/tensor/functions/ov.py | 7 ++++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 5272fcf166f..2cbb0904706 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -245,7 +245,7 @@ def _create_compression_subgraph( compression_config, layer_scales, layer_zero_points, - OVModelParameters(recompile=True, release_memory=False), + OVModelParameters(dynamic_shapes=False, recompile=True, release_memory=False), ) compressed_const = create_ov_const_from_tensor( diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index e90d1716f49..fa08fd5f7b9 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -46,7 +46,7 @@ def __init__( self, input_dtypes: Optional[Dict[str, TensorDataType]] = None, output_dtypes: Optional[Dict[str, TensorDataType]] = None, - dynamic_shapes: bool = True, + dynamic_shapes: bool = False, recompile: bool = False, release_memory: bool = True, share_inputs: bool = True, diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index 9a2d43d79d3..b7eda808447 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -116,7 +116,12 @@ def _astype_ov(a: 
ov.Tensor, dtype: TensorDataType) -> ov.Tensor: a_dtype = DTYPE_MAP_REV[a.get_element_type()] - ov_model_params = OVModelParameters(recompile=True, release_memory=False, return_ov_tensors=True) + ov_model_params = OVModelParameters( + dynamic_shapes=True, + recompile=True, + release_memory=False, + return_ov_tensors=True + ) ov_model_params.input_dtypes = {"input": a_dtype} ov_model_params.output_dtypes = {"output": dtype} model = get_astype_model(ov_model_params, tuple(a.shape)) From b372dc70543a377298f5956503713b5271980360 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 20:40:10 +0100 Subject: [PATCH 61/73] Revert "Add debug conditions" This reverts commit 9a56fae2692fc22bb6e25f74a1c5f00dfc078e86. --- .../algorithms/weight_compression/openvino_backend.py | 3 +-- .../algorithms/weight_compression/weight_lowering.py | 8 +------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 2cbb0904706..1bb5ea1adcd 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -293,8 +293,7 @@ def transform_model( const_node = self.name_to_node_mapping[const_node_name] const_node_output = const_node.output(0) const_dtype = const_node_output.get_element_type() - import os - weight = get_const_value(const_node, cast_bf16_to_fp32=False or bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))) + weight = get_const_value(const_node, cast_bf16_to_fp32=False) # Creation of ov.Tensor is required for two reasons: # 1. To be able to process BF16 weight properly # 2. To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 999888462c3..9de76e5ce71 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -451,8 +451,7 @@ def do_int_quantization( "for asymmetric quantization." 
) - import os - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Running time may be improved after installing OpenVINO") @@ -502,11 +501,6 @@ def do_int_quantization( {"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype} ) - ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) - ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) - ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) - ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0"))) - model = get_compress_weight_model( ov_model_params, config, From 63858d3958929bff306c90d62197389f019f0dba Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 21:14:04 +0100 Subject: [PATCH 62/73] Linters --- nncf/tensor/functions/ov.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index b7eda808447..f277da53d28 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -117,10 +117,7 @@ def _astype_ov(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: a_dtype = DTYPE_MAP_REV[a.get_element_type()] ov_model_params = OVModelParameters( - dynamic_shapes=True, - recompile=True, - release_memory=False, - return_ov_tensors=True + dynamic_shapes=True, recompile=True, release_memory=False, return_ov_tensors=True ) ov_model_params.input_dtypes = {"input": a_dtype} ov_model_params.output_dtypes = {"output": dtype} From 87b5c1069d292bd9c0e170184e3b56655c42f2b2 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 13 Dec 2024 10:47:11 +0100 Subject: [PATCH 63/73] Fix lora correction --- .../algorithms/weight_compression/weight_lowering.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 9de76e5ce71..227ac16342c 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -415,7 +415,9 @@ def do_int_dequantization( original shapes. If equals to -1: weights are not reshaped, assumed not a group quantization. Default to -1. :return: dequantized/decompressed weights. 
""" - decompressed_weight = compressed_weights - zero_point if zero_point is not None else compressed_weights + decompressed_weight = ( + compressed_weights.astype(TensorDataType.int32) - zero_point if zero_point is not None else compressed_weights + ) decompressed_weight = decompressed_weight.astype(scale.dtype) * scale if reduction_axis > -1: From 7134e6d43bfd3ea9b927cbf9ced3d2ee2692a86d Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 13 Dec 2024 13:46:05 +0100 Subject: [PATCH 64/73] Remove not used argument --- nncf/quantization/algorithms/weight_compression/gptq.py | 1 - .../algorithms/weight_compression/scale_estimation.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index d2178b19e91..4a5686a0ef8 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -267,7 +267,6 @@ def _quantize_weights( activations = [inp[..., (i1 + i) : (i1 + i + group_size)] for inp in inputs] wc_statistics = ScaleEstimation.activations_to_wc_statistics(activations) scale, zero_point = ScaleEstimation.calculate_quantization_params( - self._backend_entity, wc_statistics, weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes, diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index f7eff80c321..af51182a586 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -23,7 +23,6 @@ from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats -from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale @@ -146,7 +145,6 @@ def apply( weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params( - self._backend_entity, stats, weight, wp.reduction_axes, @@ -161,7 +159,6 @@ def apply( @staticmethod def calculate_quantization_params( - backend_entity: WeightCompressionAlgoBackend, statistics: WCTensorStatistic, weight: Tensor, reduction_axes: Tuple[int, ...], @@ -181,7 +178,6 @@ def calculate_quantization_params( 1. Initial scale rectification based on activation statistics. 2. A grid search to further refine the scale parameters. - :param backend_entity: The backend-specific implementation of the weight compression algorithm. :param statistics: The input activations of the layer reduced over batch and sequence length dimensions, together with original activation tensor shapes. :param weight: The weight tensor that is being quantized. 
From 5a1866f5506aef5aa7eae251a9b44e2dbbc26032 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 13 Dec 2024 14:53:13 +0100 Subject: [PATCH 65/73] Remove static shapes testing because it is not needed with non-convertable division --- .../test_ov_modeling_compression.py | 77 +++---------------- 1 file changed, 10 insertions(+), 67 deletions(-) diff --git a/tests/openvino/native/quantization/test_ov_modeling_compression.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py index 3d7f9d3b4c1..6919c23b3a0 100644 --- a/tests/openvino/native/quantization/test_ov_modeling_compression.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -31,7 +31,6 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorDataType -from nncf.tensor import functions as fns from nncf.tensor.definitions import TensorBackend from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP from nncf.tensor.functions.numpy_numeric import DTYPE_MAP_REV as DTYPE_MAP_REV_NP @@ -58,16 +57,6 @@ class QuantizationTask(Enum): WeightCompressionConfig(CompressWeightsMode.INT4_SYM, group_size=2), ] -MAX_MISALIGNMENT_FREQUENCY = { - TensorDataType.float32: 1e-2, # tends to < 5e-6 - TensorDataType.float16: 1e-2, # tends to < 5e-5 - TensorDataType.bfloat16: 1e-2, # tends to < 5e-4 -} - -MAX_MISALIGNMENT_MAGNITUDE = 1 - -EPS = np.finfo(np.float32).eps - REDUCTION_AXES = (1,) RANDOM_TENSOR_CACHE_CONTAINER = ResultsCacheContainer() @@ -107,7 +96,7 @@ def openvino_available(available: bool): nncf.import_utils._openvino_available = original_value -@pytest.mark.parametrize("weight_shape", [(10000, 4)], ids=[""]) +@pytest.mark.parametrize("weight_shape", [(100000, 4)], ids=[""]) @pytest.mark.parametrize("config", COMPRESSION_CONFIGS, ids=[str(c) for c in COMPRESSION_CONFIGS]) @pytest.mark.parametrize( ("quantization_task", "tensor_backend"), @@ -124,10 +113,7 @@ def openvino_available(available: bool): ) @pytest.mark.parametrize("dtype", [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]) @pytest.mark.parametrize("precompute_s_zp", [False, True], ids=["no-precompute", "precompute"]) -@pytest.mark.parametrize("static_shapes", [False, True], ids=["dynamic-shapes", "static-shapes"]) -def test_quantization_alignment( - weight_shape, config, quantization_task, tensor_backend, dtype, precompute_s_zp, static_shapes -): +def test_quantization_alignment(weight_shape, config, quantization_task, tensor_backend, dtype, precompute_s_zp): d1, d2 = weight_shape group_size = config.group_size zero_point_shape = scale_shape = (d1, 1) if group_size == -1 else (d1, d2 // group_size, 1) @@ -174,7 +160,7 @@ def test_quantization_alignment( kwargs = {} if cb == ComputationBackend.OV: - ov_model_params = OVModelParameters(dynamic_shapes=not static_shapes) + ov_model_params = OVModelParameters() kwargs["ov_model_params"] = ov_model_params if quantization_task == QuantizationTask.Q_DQ_RQ: kwargs["return_compressed_weight"] = True @@ -223,7 +209,7 @@ def test_quantization_alignment( decompressed_weight, ) - _check_values(static_shapes, config, precompute_s_zp, dtype, results, precomputed_scale, weight_shape) + _check_values(results) def _check_backends_and_dtypes( @@ -270,59 +256,16 @@ def _check_backends_and_dtypes( assert decompressed_weight.dtype == TensorDataType.float32 -def _check_values(static_shapes, config, precompute_s_zp, dtype, results, 
precomputed_scale, weight_shape): +def _check_values(results): + # Check that the computed tensors are equal between implementations keys = set(results[ComputationBackend.OV]).union(set(results[ComputationBackend.NumPy])) for key in keys: numpy_result = results[ComputationBackend.NumPy][key] ov_result = results[ComputationBackend.OV][key] - atol = 0 - scale = None - # For static-shaped OV models doing asymmetric compression there maybe misalignments between OV and NumPy - # For more details see ticket 156511 - if static_shapes and config.is_int_asym: - if key == "compressed_weight": - atol = MAX_MISALIGNMENT_MAGNITUDE - elif key == "decompressed_weight": - if "scale" in results[ComputationBackend.NumPy]: - scale = results[ComputationBackend.NumPy]["scale"] - else: - if precompute_s_zp: - scale = precomputed_scale - else: - weight = get_random_float_tensor(weight_shape, dtype, TensorBackend.numpy) - with openvino_available(False): - _, _, scale, _ = calculate_quantized_dequantized_weight( - weight, config, REDUCTION_AXES, return_compressed_weight=True - ) - # For decompressed weight the misalignment magnitude depends on the scale - atol = MAX_MISALIGNMENT_MAGNITUDE * fns.abs(scale).max().item() + EPS - max_misalignment_frequency = MAX_MISALIGNMENT_FREQUENCY[dtype] - else: - max_misalignment_frequency = None - - # Check that the computed tensors are equal between implementations + # Note: For static-shaped OV models doing asymmetric compression with convertable divisions there maybe + # misalignments equal to 1 quant between OV and NumPy. For more details see ticket 156511. + np.testing.assert_allclose( - ov_result.data, numpy_result.data, atol=atol, rtol=0, err_msg=f"Results do not align for {key}." + ov_result.data, numpy_result.data, atol=0, rtol=0, err_msg=f"Results do not align for {key}." 
) - - if max_misalignment_frequency is not None: - if key == "compressed_weight": - diff = fns.abs(numpy_result.astype(TensorDataType.int32) - ov_result.astype(TensorDataType.int32)) - else: - diff = fns.abs(numpy_result - ov_result) - - if diff.max() > 0: - # Check that the proportion of misaligned values is small - n_not_equal = fns.sum(diff > 0) - assert n_not_equal / numpy_result.size < max_misalignment_frequency - - # Check that the magnitude of misalignment is as small as expected - if key == "decompressed_weight": - # Reshape scale to match the shape of decompressed weight - scale = np.repeat(scale.data, diff.shape[-1], axis=-1) - np.testing.assert_array_less( - diff.data, - MAX_MISALIGNMENT_MAGNITUDE * np.abs(scale) + EPS, - err_msg=f"Too large misalignment for {key}.", - ) From 6a2c9fc928bb4862cd2c16d56a9742f6b5e5042e Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 13 Dec 2024 14:53:25 +0100 Subject: [PATCH 66/73] Set dynamic shapes by default --- .../algorithms/weight_compression/openvino_backend.py | 2 +- .../algorithms/weight_compression/openvino_modeling.py | 2 +- nncf/tensor/functions/ov.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 1bb5ea1adcd..0eaa6b72532 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -245,7 +245,7 @@ def _create_compression_subgraph( compression_config, layer_scales, layer_zero_points, - OVModelParameters(dynamic_shapes=False, recompile=True, release_memory=False), + OVModelParameters(recompile=True, release_memory=False), ) compressed_const = create_ov_const_from_tensor( diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index fa08fd5f7b9..e90d1716f49 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -46,7 +46,7 @@ def __init__( self, input_dtypes: Optional[Dict[str, TensorDataType]] = None, output_dtypes: Optional[Dict[str, TensorDataType]] = None, - dynamic_shapes: bool = False, + dynamic_shapes: bool = True, recompile: bool = False, release_memory: bool = True, share_inputs: bool = True, diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index f277da53d28..9a2d43d79d3 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -116,9 +116,7 @@ def _astype_ov(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: a_dtype = DTYPE_MAP_REV[a.get_element_type()] - ov_model_params = OVModelParameters( - dynamic_shapes=True, recompile=True, release_memory=False, return_ov_tensors=True - ) + ov_model_params = OVModelParameters(recompile=True, release_memory=False, return_ov_tensors=True) ov_model_params.input_dtypes = {"input": a_dtype} ov_model_params.output_dtypes = {"output": dtype} model = get_astype_model(ov_model_params, tuple(a.shape)) From 92fbba57394b7dd5aebdce84698a5d05e5e2d355 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 12:04:23 +0100 Subject: [PATCH 67/73] Guarantee call order --- .../utils/test_cache_results_decorator.py | 221 +++++++++--------- 1 file changed, 110 insertions(+), 111 deletions(-) diff --git a/tests/common/utils/test_cache_results_decorator.py 
b/tests/common/utils/test_cache_results_decorator.py index 599e41a421d..1a6e3e107c8 100644 --- a/tests/common/utils/test_cache_results_decorator.py +++ b/tests/common/utils/test_cache_results_decorator.py @@ -8,7 +8,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import pytest from nncf.common.utils.decorators import ResultsCacheContainer from nncf.common.utils.decorators import cache_results @@ -21,113 +20,113 @@ def cached_addition(a, b): return a + b -@pytest.mark.parametrize( - "inputs,disable_caching,output,clear_cache,cache_size,ref_cache,ref_access_count", - [ - ( - (1, 2), - False, - 3, - False, - 1, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 0}, - ), - ( - (1, 2), - False, - 3, - False, - 1, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 1}, - ), - ( - (2, 3), - True, - 5, - False, - 1, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 1}, - ), - ( - (3, 4), - False, - 7, - False, - 2, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, - }, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 1, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 0, - }, - ), - ( - (1, 2), - False, - 3, - False, - 2, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, - }, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 0, - }, - ), - ( - (3, 4), - False, - 7, - False, - 2, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, - }, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 1, - }, - ), - ( - (3, 4), - True, - 7, - False, - 2, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, - }, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 1, - }, - ), - ((3, 4), True, 7, True, 0, {}, {}), - ( - (1, 2), - False, - 3, - False, - 1, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 0}, - ), - ], -) -def test_caching_results(inputs, disable_caching, output, clear_cache, cache_size, ref_cache, ref_access_count): - if clear_cache: - TEST_CACHE_CONTAINER.clear() - kwargs = {"disable_caching": True} if disable_caching else {} - assert cached_addition(*inputs, **kwargs) == output - assert len(TEST_CACHE_CONTAINER._cache) == cache_size - assert TEST_CACHE_CONTAINER._cache == ref_cache - assert TEST_CACHE_CONTAINER._access_count == ref_access_count +CALL_SEQUENCE = [ + ( + (1, 2), + False, + 3, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 0}, + ), + ( + (1, 2), + False, + 3, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 1}, + ), + ( + (2, 3), + True, + 5, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + 
{("cached_addition", frozenset({("a", 1), ("b", 2)})): 1}, + ), + ( + (3, 4), + False, + 7, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 1, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 0, + }, + ), + ( + (1, 2), + False, + 3, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 0, + }, + ), + ( + (3, 4), + False, + 7, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 1, + }, + ), + ( + (3, 4), + True, + 7, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 1, + }, + ), + ((3, 4), True, 7, True, 0, {}, {}), + ( + (1, 2), + False, + 3, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 0}, + ), +] + + +def test_caching_results(): + for inputs, disable_caching, output, clear_cache, cache_size, ref_cache, ref_access_count in CALL_SEQUENCE: + if clear_cache: + TEST_CACHE_CONTAINER.clear() + kwargs = {"disable_caching": True} if disable_caching else {} + assert cached_addition(*inputs, **kwargs) == output + assert len(TEST_CACHE_CONTAINER._cache) == cache_size + assert TEST_CACHE_CONTAINER._cache == ref_cache + assert TEST_CACHE_CONTAINER._access_count == ref_access_count From b27c720e4e98d3ed67cd311c4a96901b47f3a68c Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 17:02:07 +0100 Subject: [PATCH 68/73] Add convertable_division parameter --- .../weight_compression/openvino_backend.py | 58 +++ .../weight_compression/openvino_modeling.py | 18 +- .../weight_compression/scale_estimation.py | 98 +++- .../scale_estimation_old.py | 424 ++++++++++++++++++ .../weight_compression/weight_lowering.py | 6 +- 5 files changed, 592 insertions(+), 12 deletions(-) create mode 100644 nncf/quantization/algorithms/weight_compression/scale_estimation_old.py diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 0eaa6b72532..bfe00223755 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -352,6 +352,64 @@ def dump_parameters( def __del__(self): clear_ov_model_cache() + @staticmethod + def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): + from openvino.properties.hint import inference_precision + import openvino as ov + + parameters, clamp = OVWeightCompressionAlgoBackend.get_compress_pipeline( + config, w_shape, s_shape, z_p_shape, True + ) + + if len(parameters) == 3: + _, s, zp = parameters + result = (clamp - zp) * s + else: + s = parameters[1] + result = clamp * s + + model = ov.Model([result], parameters) + + compiled_model = ov.compile_model(model, device_name="CPU", 
config={inference_precision: ov.Type.f32}) + + return lambda parameters: compiled_model(parameters)[0] + + @staticmethod + def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, + return_nodes=False): + from openvino.properties.hint import inference_precision + import openvino as ov + + mode = config.mode + assert mode in [ + CompressWeightsMode.INT4_SYM, + CompressWeightsMode.INT4_ASYM, + ], f"Only int4 supported, but given={mode}" + num_bits = config.num_bits + + asym_quant = mode in [CompressWeightsMode.INT4_ASYM] + level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) + level_high = 2 ** num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 + + w = opset.parameter(w_shape, name="w") + s = opset.parameter(s_shape, name="s") + parameters = [w, s] + compressed_w = w / s + if z_p_shape is not None: + zp = opset.parameter(z_p_shape, name="zp") + parameters.append(zp) + compressed_w += zp + + result = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights") + + if return_nodes: + return parameters, result + + model = ov.Model([result], parameters) + + compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision: ov.Type.f32}) + + return lambda parameters: compiled_model(parameters)[0] class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend): diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index e90d1716f49..5092eb61978 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -10,6 +10,7 @@ # limitations under the License. import copy +import os from dataclasses import dataclass from functools import partial from typing import Callable, Dict, List, Optional, Tuple, Union @@ -52,6 +53,7 @@ def __init__( share_inputs: bool = True, share_outputs: bool = True, return_ov_tensors: bool = False, + convertable_division: bool = False, ): """ :param input_dtypes: Optional dictionary mapping input names to their data types. @@ -64,6 +66,8 @@ def __init__( :param share_inputs: Whether to share input tensors. Avoids cloning inputs for inference. :param share_outputs: Whether to share output tensors. Avoids cloning outputs after the inference. :param return_ov_tensors: Whether to return results as OpenVINO tensors or NumPy arrays. + :param convertable_division: Whether to use convertable division for division operations. If True, division a/b + will be transformed at runtime to a*(1/b). 
""" self.input_dtypes = input_dtypes or {} self.output_dtypes = output_dtypes or {} @@ -73,6 +77,7 @@ def __init__( self.share_inputs = share_inputs self.share_outputs = share_outputs self.return_ov_tensors = return_ov_tensors + self.convertable_division = convertable_division def __copy__(self): return OVModelParameters( @@ -84,6 +89,7 @@ def __copy__(self): share_inputs=self.share_inputs, share_outputs=self.share_outputs, return_ov_tensors=self.return_ov_tensors, + convertable_division=self.convertable_division, ) def __deepcopy__(self, memo): @@ -96,6 +102,7 @@ def __deepcopy__(self, memo): share_inputs=self.share_inputs, share_outputs=self.share_outputs, return_ov_tensors=self.return_ov_tensors, + convertable_division=self.convertable_division, ) def __hash__(self): @@ -109,6 +116,7 @@ def __hash__(self): self.share_inputs, self.share_outputs, self.return_ov_tensors, + self.convertable_division, ) ) @@ -334,6 +342,8 @@ def _build_compress_model( level_low = 0 if is_int_asym else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if is_int_asym else 2 ** (num_bits - 1) - 1 + divide_op = opset.divide if ov_model_params.convertable_division else non_convertable_divide + min_values = None if scale_shape is not None: # Scale is given as an input @@ -348,7 +358,7 @@ def _build_compress_model( min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) levels = level_high - level_low + 1 - scale = non_convertable_divide(max_values - min_values, opset.constant(levels - 1, ov.Type.f32)) + scale = divide_op(max_values - min_values, opset.constant(levels - 1, ov.Type.f32)) scale = opset.select(opset.abs(scale) < eps, eps, scale) else: w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True)) @@ -356,7 +366,7 @@ def _build_compress_model( w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32) scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.negative(w_max)) - scale = non_convertable_divide(scale, opset.constant(-level_low, ov.Type.f32)) + scale = divide_op(scale, opset.constant(-level_low, ov.Type.f32)) scale = opset.select(opset.abs(scale) < eps, eps, scale) zero_point = None @@ -368,12 +378,12 @@ def _build_compress_model( zero_point = convert_if_needed(zero_point, ov.Type.f32) elif is_int_asym: # Compute zero point - scaled_min_values = non_convertable_divide(min_values, scale) + scaled_min_values = divide_op(min_values, scale) zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(scaled_min_values) zero_point = opset.clamp(zero_point, level_low, level_high) weight = convert_if_needed(weight, ov.Type.f32) - compressed_weight = non_convertable_divide(weight, scale) + compressed_weight = divide_op(weight, scale) if is_int_asym: compressed_weight += zero_point diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index af51182a586..948c04be951 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -8,10 +8,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import os from copy import deepcopy from typing import Any, Dict, List, Optional, Tuple, TypeVar +import numpy as np + import nncf from nncf import Dataset from nncf.common.graph.graph import NNCFGraph @@ -25,7 +27,9 @@ from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale, \ + do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization @@ -38,6 +42,61 @@ TModel = TypeVar("TModel") +def compare_tensors( + weight, + data_ref: Dict["str", Tensor], + data_actual: Dict["str", Tensor], + node_name: str, + rel_diff_threshold=5e-1, + verbosity=2 +): + hists = {} + stats = {} + for name in data_ref: + ref = data_ref[name] + actual = data_actual[name] + try: + np.testing.assert_allclose(actual.data, ref.data, atol=0, rtol=0) + except Exception as e: + not_equal = np.where(ref.data != actual.data) + diff = fns.abs(ref - actual).data[not_equal] + rel_diff = diff / fns.maximum(fns.abs(ref).data[not_equal], 1e-9) + stats[name] = (np.median(rel_diff), rel_diff.max(), len(not_equal[0]) / ref.size) + + is_fp32 = ref.dtype == TensorDataType.float32 + bins = np.logspace(-10, 2,) if is_fp32 else np.arange(17) + hists[name] = np.histogram(diff, bins=bins, density=False) + + if verbosity > 0: + print() + print(node_name, name) + print(str(e).replace("Not equal to tolerance rtol=1e-07, atol=0", "").strip()) + if verbosity > 1: + # format_str = "{:.2e}" + format_str = "{:.10f}" + zip_arg = ( + rel_diff.tolist(), + [format_str.format(it) if is_fp32 else f"{int(it):02d}" for it in ref.data[not_equal].tolist()], + [format_str.format(it) if is_fp32 else f"{int(it):02d}" for it in actual.data[not_equal].tolist()], + ) + if weight is not None: + # zip_arg += ([f"{it:.30f}" for it in weight.data[not_equal].tolist()],) + zip_arg += ([it for it in weight.data[not_equal].tolist()],) + data = list(zip(*zip_arg)) + data = list(filter(lambda it: it[0] > (rel_diff_threshold if is_fp32 else 1), data)) + if len(data) > 0: + data = sorted(data, key=lambda it: it[0], reverse=True) + data = list(zip(*data)) + print("Rel. diff:", [f"{it:.2e}" if is_fp32 else int(it) for it in data[0]][:100]) + print("Reference:", data[1][:100]) + print("Actual:", data[2][:100]) + if weight is not None: + print("Weight:", data[3][:100]) + if verbosity > 0: + print('-' * 50) + return hists, stats + + class ScaleEstimation: """ Scale estimation algorithm implementation. 
@@ -79,6 +138,9 @@ def __init__( self._set_backend_entity(model) + from nncf.quantization.algorithms.weight_compression.scale_estimation_old import ScaleEstimationOld + self.se_old = ScaleEstimationOld(model, name_to_node_mapping, all_weight_params, nodes_to_compress, statistics, subset_size, initial_steps, scale_steps, weight_penalty) + @property def available_backends(self) -> List[BackendType]: return [BackendType.OPENVINO] @@ -155,6 +217,24 @@ def apply( self._weight_penalty, ) + scale_, zero_point = self.se_old.calculate_quantization_params( + self._backend_entity, + stats, + weight, + wp.reduction_axes, + config, + self._subset_size, + self._initial_steps, + self._scale_steps, + self._weight_penalty, + ) + compare_tensors( + None, + {"scale": scale_, "zero_point": zero_point}, + {"scale": scales[weight_name], "zero_point": zero_points[weight_name]}, + node_name + ) + return scales, zero_points @staticmethod @@ -255,6 +335,9 @@ def calculate_quantization_params( zero_scale = 0.001 zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + # This is required for alignment with a previous OpenVINO models implementation + ov_model_params = OVModelParameters(dynamic_shapes=False, convertable_division=True) + # iterative rectification of initial scale for i in range(initial_steps): near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) @@ -265,8 +348,10 @@ def calculate_quantization_params( out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: out = calculate_quantized_dequantized_weight( - original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp + original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp, + ov_model_params=ov_model_params ) + q_weights_ = fns.zeros_like(original_weight) + out q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) @@ -295,7 +380,8 @@ def calculate_quantization_params( out = do_nf4_quantization(original_weight, near_to_ideal_scale) else: out, _, _ = do_int_quantization( - original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp + original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp, + ov_model_params=ov_model_params ) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) @@ -310,7 +396,7 @@ def calculate_quantization_params( out = do_nf4_quantization(original_weight, scaled_scale) else: out, _, _ = do_int_quantization( - original_weight, config, precomputed_scale=scaled_scale, precomputed_zero_point=zp + original_weight, config, precomputed_scale=scaled_scale, precomputed_zero_point=zp, ov_model_params=ov_model_params, ) compressed_weights = fns.zeros_like(original_weight) + out @@ -324,7 +410,7 @@ def calculate_quantization_params( out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: out = calculate_quantized_dequantized_weight( - original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp + original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp, ov_model_params=ov_model_params ) q_weights_ = fns.zeros_like(original_weight) + out diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation_old.py b/nncf/quantization/algorithms/weight_compression/scale_estimation_old.py new file mode 100644 index 00000000000..88455f5b651 --- /dev/null +++ 
b/nncf/quantization/algorithms/weight_compression/scale_estimation_old.py @@ -0,0 +1,424 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from copy import deepcopy +from typing import Any, Dict, List, Optional, Tuple, TypeVar + +import nncf +from nncf import Dataset +from nncf.common.graph.graph import NNCFGraph +from nncf.common.graph.graph import NNCFNode +from nncf.common.logging.track_progress import track +from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer +from nncf.common.utils.backend import BackendType +from nncf.common.utils.backend import get_backend +from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic +from nncf.parameters import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats +from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType +from nncf.tensor import functions as fns + +TModel = TypeVar("TModel") + + +class ScaleEstimationOld: + """ + Scale estimation algorithm implementation. + """ + + compress_decompress_cache = {} + + def __init__( + self, + model: TModel, + name_to_node_mapping: Dict[str, Any], + all_weight_params: List[WeightCompressionParameters], + nodes_to_compress: List[NNCFNode], + statistics: Dict[str, WCTensorStatistic], + subset_size: int = 32, + initial_steps: int = 5, + scale_steps: int = 10, + weight_penalty: float = -1.0, + ): + """ + :param model: Model for applying algorithm. + :param name_to_node_mapping: Name to node mapping for updating node weights. + :param all_weight_params: List of all weight parameters. + :param nodes_to_compress: List of nodes for processing. + :param statistics: Input activation statistics for each node. + :param subset_size: The number of samples for scale estimation. + :param initial_steps: The number of the steps for absmax scale rectification. + :param scale_steps: The number of the steps for grid search scale rectification + from 1.0 to 1.0 - 0.05 * scale_step. + :param weight_penalty: coefficient for penalty between fp and compressed weights. 
If -1 then doesn't apply. + """ + super().__init__() + self.name_to_node_mapping = name_to_node_mapping + self._all_weight_params = all_weight_params + self._nodes_to_compress = nodes_to_compress + self._statistics = statistics + self._subset_size = subset_size + self._initial_steps = initial_steps + self._scale_steps = scale_steps + self._weight_penalty = weight_penalty + + self._set_backend_entity(model) + + @property + def available_backends(self) -> List[BackendType]: + return [BackendType.OPENVINO] + + def _set_backend_entity(self, model: TModel) -> None: + """ + Creates a helper class with a backed-specific logic of the algorithm. + + :param model: Backend-specific input model. + :param all_weight_params: List of all weight parameters. + :param nodes_to_compress: List of nodes for processing. + :param activations: The input activations of the layers considered for compression. + """ + + model_backend = get_backend(model) + if model_backend == BackendType.OPENVINO: + from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend + + self._backend_entity = OVWeightCompressionAlgoBackend(model, self.name_to_node_mapping) + else: + raise nncf.UnsupportedBackendError( + "Cannot return backend-specific AWQ entity because {} is not supported!".format(model_backend.value) + ) + + def apply( + self, + model: TModel, + graph: NNCFGraph, + statistic_points: Optional[StatisticPointsContainer] = None, + dataset: Optional[Dataset] = None, + ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: + """ + Estimates better scale for the int4 nodes in the model. + Minimizes per-group difference between floating point MatMul and + MatMul with compressed weights. + The algorithm computes weighted scale for the group of weights in MatMul, which + shared the same scale. + + :param model: Model for applying algorithm. + :param graph: Model graph. + :param statistic_points: Statistic points with collected statistics values. + :param dataset: A representative dataset for the calibration process. + :return: Two dictionaries for estimated scales and zero points for each weight name. + """ + + scales, zero_points = dict(), dict() + + for wp in track(self._all_weight_params, description="Applying Scale Estimation"): + weight_name = wp.weight_name + node_name = wp.node_with_weight.node_name + config = wp.compression_config + + if config.num_bits != 4 or node_name not in self._statistics: + scales[weight_name] = None + continue + + stats = self._statistics[node_name] + + weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) + if len(weight_data) != 1: # not supported by the algorithm + continue + _, weight_port_id = weight_data[0] + + weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) + + scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params( + self._backend_entity, + stats, + weight, + wp.reduction_axes, + config, + self._subset_size, + self._initial_steps, + self._scale_steps, + self._weight_penalty, + ) + + return scales, zero_points + + @staticmethod + def calculate_quantization_params( + backend_entity: WeightCompressionAlgoBackend, + statistics: WCTensorStatistic, + weight: Tensor, + reduction_axes: Tuple[int, ...], + config: WeightCompressionConfig, + subset_size: int = 32, + initial_steps: int = 5, + scale_steps: int = 10, + weight_penalty: float = -1.0, + ) -> Tensor: + """ + Calculates the quantization parameters for a given set of weights and activations. 
+ This function estimates the optimal quantization scale for weight compression by + minimizing the difference between floating-point operations and operations with + quantized weights. + + The function uses an iterative process: + 1. Initial scale rectification based on activation statistics. + 2. A grid search to further refine the scale parameters. + + :param backend_entity: The backend-specific implementation of the weight compression algorithm. + :param statistics: The input activations of the layer reduced over batch and sequence length dimensions, + together with original activation tensor shapes. + :param weight: The weight tensor that is being quantized. + :param reduction_axes: Tuple specifying the axes along which the reduction is performed for quantization. + :param config: Configuration parameters for the weight compression, including quantization settings. + :param subset_size: The number of samples to use for scale estimation. Defaults to 32. + :param initial_steps: The number of steps for initial scale rectification using activation statistics. + Defaults to 5. + :param scale_steps: The number of steps for refining the scale using a grid search. Defaults to 10. + :param weight_penalty: Penalty coefficient applied to the difference between floating-point + and quantized weights. A value of -1 disables the penalty. Defaults to -1.0. + :return: A tensor containing the calculated quantization scales and zero points if applicable. + """ + reduction_axis = reduction_axes[0] + + s, X = process_stats(statistics, subset_size) + + weight = weight.astype(TensorDataType.float32) + eps = fns.finfo(weight).eps + + if reduction_axis == 0: + weight = fns.transpose(weight) + reduction_axis = 1 + + group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] + cur_config = deepcopy(config) + cur_config.group_size = group_size + + original_weight = fns.zeros_like(weight) + weight + if config.mode == CompressWeightsMode.NF4: + norm_weight, scale = calculate_normalized_weight_and_fp4_scale( + original_weight, reduction_axis, cur_config.group_size + ) + compressed_weights = do_nf4_quantization(norm_weight, scale, is_normalized_weight=True) + q_weights = do_nf4_dequantization(compressed_weights, scale, reduction_axis) + zp = None + else: + compressed_weights, scale, zp = do_int_quantization(original_weight, cur_config, reduction_axis, is_numpy=True) + if zp is not None: + zp = zp.astype(scale.dtype) + q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) + + s = fns.unsqueeze(s, 0) + s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) + + original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size) + + # all weight in group has importance based on corresponding input activations + importance = fns.ones_like(original_weight) + importance = importance * s + + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + importance = fns.where(zero_mask, 0.0, importance) + + # normalize importances for every group of weights to make sum of them equal to 1.0 + denum = fns.sum(importance, axis=2, keepdims=True) + importance = importance / (denum + eps) + + X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) + q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) + best_diffs = None + result_scale = None + + fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X) + q_outs = 
fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X) + + # metric for minimization with shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE + min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + min_max_scale_diffs += weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) + + zp_shape = zp.shape if zp is not None else None + key = (config.mode, config.num_bits) + q_weights.shape + scale.shape + if zp is not None: + key += zp_shape + if config.mode != CompressWeightsMode.NF4: + if key in ScaleEstimationOld.compress_decompress_cache: + compress_decompress_model = ScaleEstimationOld.compress_decompress_cache[key]["compress_decompress_model"] + compress_model = ScaleEstimationOld.compress_decompress_cache[key]["compress_model"] + else: + compress_decompress_model = backend_entity.get_compress_decompress_pipeline( + config, q_weights.shape, scale.shape, zp_shape + ) + compress_model = backend_entity.get_compress_pipeline(config, q_weights.shape, scale.shape, zp_shape) + ScaleEstimationOld.compress_decompress_cache[key] = { + "compress_decompress_model": compress_decompress_model, + "compress_model": compress_model, + } + scale_sign = scale / fns.abs(scale) + zero_scale = 0.001 + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + + input_tensors = [original_weight.data, None] + if zp is not None: + input_tensors.append(zp.data) + # iterative rectification of initial scale + for i in range(initial_steps): + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + near_to_ideal_scale = near_to_ideal_scale * scale_sign + input_tensors[1] = near_to_ideal_scale.data + + if config.mode == CompressWeightsMode.NF4: + g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) + out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) + else: + out = compress_decompress_model(input_tensors) + q_weights_ = fns.zeros_like(original_weight) + out + q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + + ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + + if best_diffs is None: + best_diffs = min_max_scale_diffs + + mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + + best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + + mask = fns.unsqueeze(mask, axis=2) + + if result_scale is None: + near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale + else: + near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale + result_scale = near_to_ideal_scale + input_tensors[1] = near_to_ideal_scale.data + + if i < initial_steps - 1: + if config.mode == CompressWeightsMode.NF4: + out = do_nf4_quantization(original_weight, near_to_ideal_scale) + else: + out = compress_model(input_tensors) + compressed_weights = fns.zeros_like(original_weight) + out + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + + # iterative rectification of scale based on grid search + for scale_step in range(scale_steps): + factor = 1.0 - 0.05 * scale_step + scaled_scale = factor * scale + + input_tensors[1] = scaled_scale.data + if config.mode == CompressWeightsMode.NF4: + out = 
do_nf4_quantization(original_weight, scaled_scale) + else: + out = compress_model(input_tensors) + compressed_weights = fns.zeros_like(original_weight) + out + + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + near_to_ideal_scale = near_to_ideal_scale * scale_sign + + input_tensors[1] = near_to_ideal_scale.data + if config.mode == CompressWeightsMode.NF4: + g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) + out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) + else: + out = compress_decompress_model(input_tensors) + q_weights_ = fns.zeros_like(original_weight) + out + + q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + + mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + + best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + + mask = fns.unsqueeze(mask, axis=2) + + if result_scale is None: + near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale + else: + near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale + result_scale = near_to_ideal_scale + + if config.group_size == -1: + result_scale = fns.squeeze(result_scale, axis=1) + if zp is not None and config.group_size == -1: + zp = fns.squeeze(zp, axis=1) + + return result_scale, zp + + @staticmethod + def activations_to_wc_statistics(activations: List[Tensor]) -> WCTensorStatistic: + """ + Mimic the activation reducing logic from WeightCompression.get_statistic_points. + + :param activations: List of raw activations. + :return: Instance of WCTensorStatistic class containing reduced activations and shapes. + """ + mean_values = [] + shapes = [] + for act in activations: + shapes.append(act.shape) + reduction_shape = tuple(range(act.ndim - 1)) + mean_values.append(fns.mean(act, axis=reduction_shape)) + wc_statistics = WCTensorStatistic(mean_values, shapes) + return wc_statistics + + +def get_target_zero_mask(compressed_weights: Tensor, zp: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: + """ + Computes the target values and a mask indicating zero values in the target. + + :param compressed_weights: The compressed weights tensor. + :param zp: The zero point tensor. + :return: The compressed weights optionally adjusted by the zero point and + a boolean mask indicating positions in the target that are close to zero. + """ + target = compressed_weights + if zp is not None: + target = target.astype(dtype=zp.dtype) - zp + zero_mask = fns.isclose(target, 0) + return target, zero_mask + + +def estimate_scales(weight: Tensor, target: Tensor, zero_mask: Tensor, importance: Tensor) -> Tensor: + """ + Estimates scales for the given weight, target, zero mask, and importance. + + :param weight: The weights tensor. + :param target: The target values tensor. + :param zero_mask: A boolean mask indicating positions in the target that are close to zero. + :param importance: The importance values tensor. 
+ :return: The estimated scales + """ + ideal_scale = fns.abs(weight) / (fns.abs(target) + zero_mask) + weighted_scale = ideal_scale * importance + near_to_ideal_scale = fns.sum(weighted_scale, axis=2, keepdims=True) + return near_to_ideal_scale \ No newline at end of file diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 227ac16342c..0447d8db90b 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -433,6 +433,7 @@ def do_int_quantization( precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, ov_model_params: Optional = None, + is_numpy: bool = False, ) -> Tuple[Tensor, Tensor, Tensor]: """ Performs integer quantization on the given weight tensor. @@ -453,7 +454,7 @@ def do_int_quantization( "for asymmetric quantization." ) - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not is_numpy if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Running time may be improved after installing OpenVINO") @@ -545,6 +546,7 @@ def calculate_quantized_dequantized_weight( precomputed_zero_point: Optional[Tensor] = None, return_compressed_weight: Optional[bool] = False, ov_model_params: Optional = None, + is_numpy: bool = False, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: """ First quantizes the given weight tensor and then dequantizes it back to obtain float32 values. @@ -560,7 +562,7 @@ def calculate_quantized_dequantized_weight( :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight, scale, (and zero point). 
""" - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not is_numpy if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") From 6ab1c0847a8524ae3a348d52fe19333611e6602f Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 17:07:33 +0100 Subject: [PATCH 69/73] Cleanup --- .../weight_compression/openvino_backend.py | 58 --- .../weight_compression/openvino_modeling.py | 10 +- .../weight_compression/scale_estimation.py | 109 +---- .../scale_estimation_old.py | 424 ------------------ .../weight_compression/weight_lowering.py | 8 +- 5 files changed, 30 insertions(+), 579 deletions(-) delete mode 100644 nncf/quantization/algorithms/weight_compression/scale_estimation_old.py diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index bfe00223755..0eaa6b72532 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -352,64 +352,6 @@ def dump_parameters( def __del__(self): clear_ov_model_cache() - @staticmethod - def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): - from openvino.properties.hint import inference_precision - import openvino as ov - - parameters, clamp = OVWeightCompressionAlgoBackend.get_compress_pipeline( - config, w_shape, s_shape, z_p_shape, True - ) - - if len(parameters) == 3: - _, s, zp = parameters - result = (clamp - zp) * s - else: - s = parameters[1] - result = clamp * s - - model = ov.Model([result], parameters) - - compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision: ov.Type.f32}) - - return lambda parameters: compiled_model(parameters)[0] - - @staticmethod - def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, - return_nodes=False): - from openvino.properties.hint import inference_precision - import openvino as ov - - mode = config.mode - assert mode in [ - CompressWeightsMode.INT4_SYM, - CompressWeightsMode.INT4_ASYM, - ], f"Only int4 supported, but given={mode}" - num_bits = config.num_bits - - asym_quant = mode in [CompressWeightsMode.INT4_ASYM] - level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) - level_high = 2 ** num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - - w = opset.parameter(w_shape, name="w") - s = opset.parameter(s_shape, name="s") - parameters = [w, s] - compressed_w = w / s - if z_p_shape is not None: - zp = opset.parameter(z_p_shape, name="zp") - parameters.append(zp) - compressed_w += zp - - result = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights") - - if return_nodes: - return parameters, result - - model = ov.Model([result], parameters) - - compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision: ov.Type.f32}) - - return lambda parameters: compiled_model(parameters)[0] class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend): diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 5092eb61978..a9c569ea663 100644 --- 
a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -10,7 +10,6 @@ # limitations under the License. import copy -import os from dataclasses import dataclass from functools import partial from typing import Callable, Dict, List, Optional, Tuple, Union @@ -32,6 +31,7 @@ TensorList = List[Tensor] ModelCallable = Callable[[TensorList], TensorList] +ReductionAxes = Union[int, Tuple[int, ...]] OV_MODEL_CACHE = ResultsCacheContainer() @@ -173,7 +173,7 @@ def get_compress_weight_model( weight_shape: Tuple, scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, - reduction_axes: Optional[Tuple] = None, + reduction_axes: Optional[ReductionAxes] = None, return_nodes: Optional[bool] = False, ) -> Union[ModelCallable, ModelAsNodes]: """ @@ -222,7 +222,7 @@ def get_compress_decompress_weight_model( weight_shape: Tuple, scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, - reduction_axes: Optional[Tuple] = None, + reduction_axes: Optional[ReductionAxes] = None, return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: """ @@ -270,7 +270,7 @@ def _build_compress_model( weight_shape: Tuple, scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, - reduction_axes: Optional[Tuple] = None, + reduction_axes: Optional[ReductionAxes] = None, return_nodes: bool = False, ) -> Union[ModelCallable, ModelAsNodes]: is_int_asym = config.is_int_asym @@ -415,7 +415,7 @@ def _build_compress_decompress_model( weight_shape: Tuple, scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, - reduction_axes: Optional[Tuple] = None, + reduction_axes: Optional[ReductionAxes] = None, return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: default_output_dtypes = {"decompressed_weight": TensorDataType.float32} diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 948c04be951..3330b1f7279 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -8,12 +8,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os + from copy import deepcopy from typing import Any, Dict, List, Optional, Tuple, TypeVar -import numpy as np - import nncf from nncf import Dataset from nncf.common.graph.graph import NNCFGraph @@ -28,8 +26,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale, \ - do_int_dequantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization @@ -42,61 +39,6 @@ TModel = TypeVar("TModel") -def compare_tensors( - weight, - data_ref: Dict["str", Tensor], - data_actual: Dict["str", Tensor], - node_name: str, - rel_diff_threshold=5e-1, - verbosity=2 -): - hists = {} - stats = {} - for name in data_ref: - ref = data_ref[name] - actual = data_actual[name] - try: - np.testing.assert_allclose(actual.data, ref.data, atol=0, rtol=0) - except Exception as e: - not_equal = np.where(ref.data != actual.data) - diff = fns.abs(ref - actual).data[not_equal] - rel_diff = diff / fns.maximum(fns.abs(ref).data[not_equal], 1e-9) - stats[name] = (np.median(rel_diff), rel_diff.max(), len(not_equal[0]) / ref.size) - - is_fp32 = ref.dtype == TensorDataType.float32 - bins = np.logspace(-10, 2,) if is_fp32 else np.arange(17) - hists[name] = np.histogram(diff, bins=bins, density=False) - - if verbosity > 0: - print() - print(node_name, name) - print(str(e).replace("Not equal to tolerance rtol=1e-07, atol=0", "").strip()) - if verbosity > 1: - # format_str = "{:.2e}" - format_str = "{:.10f}" - zip_arg = ( - rel_diff.tolist(), - [format_str.format(it) if is_fp32 else f"{int(it):02d}" for it in ref.data[not_equal].tolist()], - [format_str.format(it) if is_fp32 else f"{int(it):02d}" for it in actual.data[not_equal].tolist()], - ) - if weight is not None: - # zip_arg += ([f"{it:.30f}" for it in weight.data[not_equal].tolist()],) - zip_arg += ([it for it in weight.data[not_equal].tolist()],) - data = list(zip(*zip_arg)) - data = list(filter(lambda it: it[0] > (rel_diff_threshold if is_fp32 else 1), data)) - if len(data) > 0: - data = sorted(data, key=lambda it: it[0], reverse=True) - data = list(zip(*data)) - print("Rel. diff:", [f"{it:.2e}" if is_fp32 else int(it) for it in data[0]][:100]) - print("Reference:", data[1][:100]) - print("Actual:", data[2][:100]) - if weight is not None: - print("Weight:", data[3][:100]) - if verbosity > 0: - print('-' * 50) - return hists, stats - - class ScaleEstimation: """ Scale estimation algorithm implementation. 
@@ -138,9 +80,6 @@ def __init__( self._set_backend_entity(model) - from nncf.quantization.algorithms.weight_compression.scale_estimation_old import ScaleEstimationOld - self.se_old = ScaleEstimationOld(model, name_to_node_mapping, all_weight_params, nodes_to_compress, statistics, subset_size, initial_steps, scale_steps, weight_penalty) - @property def available_backends(self) -> List[BackendType]: return [BackendType.OPENVINO] @@ -217,24 +156,6 @@ def apply( self._weight_penalty, ) - scale_, zero_point = self.se_old.calculate_quantization_params( - self._backend_entity, - stats, - weight, - wp.reduction_axes, - config, - self._subset_size, - self._initial_steps, - self._scale_steps, - self._weight_penalty, - ) - compare_tensors( - None, - {"scale": scale_, "zero_point": zero_point}, - {"scale": scales[weight_name], "zero_point": zero_points[weight_name]}, - node_name - ) - return scales, zero_points @staticmethod @@ -348,8 +269,11 @@ def calculate_quantization_params( out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: out = calculate_quantized_dequantized_weight( - original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp, - ov_model_params=ov_model_params + original_weight, + config, + precomputed_scale=near_to_ideal_scale, + precomputed_zero_point=zp, + ov_model_params=ov_model_params, ) q_weights_ = fns.zeros_like(original_weight) + out @@ -380,8 +304,11 @@ def calculate_quantization_params( out = do_nf4_quantization(original_weight, near_to_ideal_scale) else: out, _, _ = do_int_quantization( - original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp, - ov_model_params=ov_model_params + original_weight, + config, + precomputed_scale=near_to_ideal_scale, + precomputed_zero_point=zp, + ov_model_params=ov_model_params, ) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) @@ -396,7 +323,11 @@ def calculate_quantization_params( out = do_nf4_quantization(original_weight, scaled_scale) else: out, _, _ = do_int_quantization( - original_weight, config, precomputed_scale=scaled_scale, precomputed_zero_point=zp, ov_model_params=ov_model_params, + original_weight, + config, + precomputed_scale=scaled_scale, + precomputed_zero_point=zp, + ov_model_params=ov_model_params, ) compressed_weights = fns.zeros_like(original_weight) + out @@ -410,7 +341,11 @@ def calculate_quantization_params( out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: out = calculate_quantized_dequantized_weight( - original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp, ov_model_params=ov_model_params + original_weight, + config, + precomputed_scale=near_to_ideal_scale, + precomputed_zero_point=zp, + ov_model_params=ov_model_params, ) q_weights_ = fns.zeros_like(original_weight) + out diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation_old.py b/nncf/quantization/algorithms/weight_compression/scale_estimation_old.py deleted file mode 100644 index 88455f5b651..00000000000 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation_old.py +++ /dev/null @@ -1,424 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from copy import deepcopy -from typing import Any, Dict, List, Optional, Tuple, TypeVar - -import nncf -from nncf import Dataset -from nncf.common.graph.graph import NNCFGraph -from nncf.common.graph.graph import NNCFNode -from nncf.common.logging.track_progress import track -from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer -from nncf.common.utils.backend import BackendType -from nncf.common.utils.backend import get_backend -from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic -from nncf.parameters import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats -from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization -from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization -from nncf.tensor import Tensor -from nncf.tensor import TensorDataType -from nncf.tensor import functions as fns - -TModel = TypeVar("TModel") - - -class ScaleEstimationOld: - """ - Scale estimation algorithm implementation. - """ - - compress_decompress_cache = {} - - def __init__( - self, - model: TModel, - name_to_node_mapping: Dict[str, Any], - all_weight_params: List[WeightCompressionParameters], - nodes_to_compress: List[NNCFNode], - statistics: Dict[str, WCTensorStatistic], - subset_size: int = 32, - initial_steps: int = 5, - scale_steps: int = 10, - weight_penalty: float = -1.0, - ): - """ - :param model: Model for applying algorithm. - :param name_to_node_mapping: Name to node mapping for updating node weights. - :param all_weight_params: List of all weight parameters. - :param nodes_to_compress: List of nodes for processing. - :param statistics: Input activation statistics for each node. - :param subset_size: The number of samples for scale estimation. - :param initial_steps: The number of the steps for absmax scale rectification. - :param scale_steps: The number of the steps for grid search scale rectification - from 1.0 to 1.0 - 0.05 * scale_step. - :param weight_penalty: coefficient for penalty between fp and compressed weights. If -1 then doesn't apply. 
- """ - super().__init__() - self.name_to_node_mapping = name_to_node_mapping - self._all_weight_params = all_weight_params - self._nodes_to_compress = nodes_to_compress - self._statistics = statistics - self._subset_size = subset_size - self._initial_steps = initial_steps - self._scale_steps = scale_steps - self._weight_penalty = weight_penalty - - self._set_backend_entity(model) - - @property - def available_backends(self) -> List[BackendType]: - return [BackendType.OPENVINO] - - def _set_backend_entity(self, model: TModel) -> None: - """ - Creates a helper class with a backed-specific logic of the algorithm. - - :param model: Backend-specific input model. - :param all_weight_params: List of all weight parameters. - :param nodes_to_compress: List of nodes for processing. - :param activations: The input activations of the layers considered for compression. - """ - - model_backend = get_backend(model) - if model_backend == BackendType.OPENVINO: - from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend - - self._backend_entity = OVWeightCompressionAlgoBackend(model, self.name_to_node_mapping) - else: - raise nncf.UnsupportedBackendError( - "Cannot return backend-specific AWQ entity because {} is not supported!".format(model_backend.value) - ) - - def apply( - self, - model: TModel, - graph: NNCFGraph, - statistic_points: Optional[StatisticPointsContainer] = None, - dataset: Optional[Dataset] = None, - ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: - """ - Estimates better scale for the int4 nodes in the model. - Minimizes per-group difference between floating point MatMul and - MatMul with compressed weights. - The algorithm computes weighted scale for the group of weights in MatMul, which - shared the same scale. - - :param model: Model for applying algorithm. - :param graph: Model graph. - :param statistic_points: Statistic points with collected statistics values. - :param dataset: A representative dataset for the calibration process. - :return: Two dictionaries for estimated scales and zero points for each weight name. - """ - - scales, zero_points = dict(), dict() - - for wp in track(self._all_weight_params, description="Applying Scale Estimation"): - weight_name = wp.weight_name - node_name = wp.node_with_weight.node_name - config = wp.compression_config - - if config.num_bits != 4 or node_name not in self._statistics: - scales[weight_name] = None - continue - - stats = self._statistics[node_name] - - weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) - if len(weight_data) != 1: # not supported by the algorithm - continue - _, weight_port_id = weight_data[0] - - weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - - scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params( - self._backend_entity, - stats, - weight, - wp.reduction_axes, - config, - self._subset_size, - self._initial_steps, - self._scale_steps, - self._weight_penalty, - ) - - return scales, zero_points - - @staticmethod - def calculate_quantization_params( - backend_entity: WeightCompressionAlgoBackend, - statistics: WCTensorStatistic, - weight: Tensor, - reduction_axes: Tuple[int, ...], - config: WeightCompressionConfig, - subset_size: int = 32, - initial_steps: int = 5, - scale_steps: int = 10, - weight_penalty: float = -1.0, - ) -> Tensor: - """ - Calculates the quantization parameters for a given set of weights and activations. 
- This function estimates the optimal quantization scale for weight compression by - minimizing the difference between floating-point operations and operations with - quantized weights. - - The function uses an iterative process: - 1. Initial scale rectification based on activation statistics. - 2. A grid search to further refine the scale parameters. - - :param backend_entity: The backend-specific implementation of the weight compression algorithm. - :param statistics: The input activations of the layer reduced over batch and sequence length dimensions, - together with original activation tensor shapes. - :param weight: The weight tensor that is being quantized. - :param reduction_axes: Tuple specifying the axes along which the reduction is performed for quantization. - :param config: Configuration parameters for the weight compression, including quantization settings. - :param subset_size: The number of samples to use for scale estimation. Defaults to 32. - :param initial_steps: The number of steps for initial scale rectification using activation statistics. - Defaults to 5. - :param scale_steps: The number of steps for refining the scale using a grid search. Defaults to 10. - :param weight_penalty: Penalty coefficient applied to the difference between floating-point - and quantized weights. A value of -1 disables the penalty. Defaults to -1.0. - :return: A tensor containing the calculated quantization scales and zero points if applicable. - """ - reduction_axis = reduction_axes[0] - - s, X = process_stats(statistics, subset_size) - - weight = weight.astype(TensorDataType.float32) - eps = fns.finfo(weight).eps - - if reduction_axis == 0: - weight = fns.transpose(weight) - reduction_axis = 1 - - group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] - cur_config = deepcopy(config) - cur_config.group_size = group_size - - original_weight = fns.zeros_like(weight) + weight - if config.mode == CompressWeightsMode.NF4: - norm_weight, scale = calculate_normalized_weight_and_fp4_scale( - original_weight, reduction_axis, cur_config.group_size - ) - compressed_weights = do_nf4_quantization(norm_weight, scale, is_normalized_weight=True) - q_weights = do_nf4_dequantization(compressed_weights, scale, reduction_axis) - zp = None - else: - compressed_weights, scale, zp = do_int_quantization(original_weight, cur_config, reduction_axis, is_numpy=True) - if zp is not None: - zp = zp.astype(scale.dtype) - q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) - - s = fns.unsqueeze(s, 0) - s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) - - original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size) - - # all weight in group has importance based on corresponding input activations - importance = fns.ones_like(original_weight) - importance = importance * s - - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - importance = fns.where(zero_mask, 0.0, importance) - - # normalize importances for every group of weights to make sum of them equal to 1.0 - denum = fns.sum(importance, axis=2, keepdims=True) - importance = importance / (denum + eps) - - X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) - q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) - best_diffs = None - result_scale = None - - fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X) - q_outs = 
fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X) - - # metric for minimization with shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE - min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0)) - if weight_penalty > 0.0: - min_max_scale_diffs += weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - - zp_shape = zp.shape if zp is not None else None - key = (config.mode, config.num_bits) + q_weights.shape + scale.shape - if zp is not None: - key += zp_shape - if config.mode != CompressWeightsMode.NF4: - if key in ScaleEstimationOld.compress_decompress_cache: - compress_decompress_model = ScaleEstimationOld.compress_decompress_cache[key]["compress_decompress_model"] - compress_model = ScaleEstimationOld.compress_decompress_cache[key]["compress_model"] - else: - compress_decompress_model = backend_entity.get_compress_decompress_pipeline( - config, q_weights.shape, scale.shape, zp_shape - ) - compress_model = backend_entity.get_compress_pipeline(config, q_weights.shape, scale.shape, zp_shape) - ScaleEstimationOld.compress_decompress_cache[key] = { - "compress_decompress_model": compress_decompress_model, - "compress_model": compress_model, - } - scale_sign = scale / fns.abs(scale) - zero_scale = 0.001 - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - - input_tensors = [original_weight.data, None] - if zp is not None: - input_tensors.append(zp.data) - # iterative rectification of initial scale - for i in range(initial_steps): - near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - near_to_ideal_scale = near_to_ideal_scale * scale_sign - input_tensors[1] = near_to_ideal_scale.data - - if config.mode == CompressWeightsMode.NF4: - g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) - out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) - else: - out = compress_decompress_model(input_tensors) - q_weights_ = fns.zeros_like(original_weight) + out - q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) - - ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) - if weight_penalty > 0.0: - ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) - - if best_diffs is None: - best_diffs = min_max_scale_diffs - - mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) - - best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs - - mask = fns.unsqueeze(mask, axis=2) - - if result_scale is None: - near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale - else: - near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale - result_scale = near_to_ideal_scale - input_tensors[1] = near_to_ideal_scale.data - - if i < initial_steps - 1: - if config.mode == CompressWeightsMode.NF4: - out = do_nf4_quantization(original_weight, near_to_ideal_scale) - else: - out = compress_model(input_tensors) - compressed_weights = fns.zeros_like(original_weight) + out - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - - # iterative rectification of scale based on grid search - for scale_step in range(scale_steps): - factor = 1.0 - 0.05 * scale_step - scaled_scale = factor * scale - - input_tensors[1] = scaled_scale.data - if config.mode == CompressWeightsMode.NF4: - out = 
do_nf4_quantization(original_weight, scaled_scale) - else: - out = compress_model(input_tensors) - compressed_weights = fns.zeros_like(original_weight) + out - - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - near_to_ideal_scale = near_to_ideal_scale * scale_sign - - input_tensors[1] = near_to_ideal_scale.data - if config.mode == CompressWeightsMode.NF4: - g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) - out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) - else: - out = compress_decompress_model(input_tensors) - q_weights_ = fns.zeros_like(original_weight) + out - - q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) - ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) - if weight_penalty > 0.0: - ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) - - mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) - - best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs - - mask = fns.unsqueeze(mask, axis=2) - - if result_scale is None: - near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale - else: - near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale - result_scale = near_to_ideal_scale - - if config.group_size == -1: - result_scale = fns.squeeze(result_scale, axis=1) - if zp is not None and config.group_size == -1: - zp = fns.squeeze(zp, axis=1) - - return result_scale, zp - - @staticmethod - def activations_to_wc_statistics(activations: List[Tensor]) -> WCTensorStatistic: - """ - Mimic the activation reducing logic from WeightCompression.get_statistic_points. - - :param activations: List of raw activations. - :return: Instance of WCTensorStatistic class containing reduced activations and shapes. - """ - mean_values = [] - shapes = [] - for act in activations: - shapes.append(act.shape) - reduction_shape = tuple(range(act.ndim - 1)) - mean_values.append(fns.mean(act, axis=reduction_shape)) - wc_statistics = WCTensorStatistic(mean_values, shapes) - return wc_statistics - - -def get_target_zero_mask(compressed_weights: Tensor, zp: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: - """ - Computes the target values and a mask indicating zero values in the target. - - :param compressed_weights: The compressed weights tensor. - :param zp: The zero point tensor. - :return: The compressed weights optionally adjusted by the zero point and - a boolean mask indicating positions in the target that are close to zero. - """ - target = compressed_weights - if zp is not None: - target = target.astype(dtype=zp.dtype) - zp - zero_mask = fns.isclose(target, 0) - return target, zero_mask - - -def estimate_scales(weight: Tensor, target: Tensor, zero_mask: Tensor, importance: Tensor) -> Tensor: - """ - Estimates scales for the given weight, target, zero mask, and importance. - - :param weight: The weights tensor. - :param target: The target values tensor. - :param zero_mask: A boolean mask indicating positions in the target that are close to zero. - :param importance: The importance values tensor. 
- :return: The estimated scales - """ - ideal_scale = fns.abs(weight) / (fns.abs(target) + zero_mask) - weighted_scale = ideal_scale * importance - near_to_ideal_scale = fns.sum(weighted_scale, axis=2, keepdims=True) - return near_to_ideal_scale \ No newline at end of file diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 0447d8db90b..b6a6c218809 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -26,7 +26,7 @@ from nncf.tensor.definitions import TensorBackend from nncf.tensor.definitions import TensorDataType -ReductionAxes = Tuple[int, ...] +ReductionAxes = Union[int, Tuple[int, ...]] NF4_QUANTILES = np.array( [ @@ -433,7 +433,6 @@ def do_int_quantization( precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, ov_model_params: Optional = None, - is_numpy: bool = False, ) -> Tuple[Tensor, Tensor, Tensor]: """ Performs integer quantization on the given weight tensor. @@ -454,7 +453,7 @@ def do_int_quantization( "for asymmetric quantization." ) - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not is_numpy + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Running time may be improved after installing OpenVINO") @@ -546,7 +545,6 @@ def calculate_quantized_dequantized_weight( precomputed_zero_point: Optional[Tensor] = None, return_compressed_weight: Optional[bool] = False, ov_model_params: Optional = None, - is_numpy: bool = False, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: """ First quantizes the given weight tensor and then dequantizes it back to obtain float32 values. @@ -562,7 +560,7 @@ def calculate_quantized_dequantized_weight( :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight, scale, (and zero point). 
""" - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not is_numpy + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") From a0fe91a52c8cf38865cd65cc84bf80331f9bac9c Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 17:26:55 +0100 Subject: [PATCH 70/73] Add convertable division test --- .../openvino/native/test_openvino_modeling.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/openvino/native/test_openvino_modeling.py b/tests/openvino/native/test_openvino_modeling.py index b4bb991d592..e41b39b2359 100644 --- a/tests/openvino/native/test_openvino_modeling.py +++ b/tests/openvino/native/test_openvino_modeling.py @@ -295,3 +295,25 @@ def test_share_inputs_outputs(mocker, share_inputs, share_outputs, return_ov_ten compiled_model.assert_called_once_with( [input_tensor.data], share_inputs=share_inputs, share_outputs=share_outputs ) + + +@pytest.mark.parametrize( + "weight,convertable_division,ref_compressed_weight", + [ + ([[0.70361328125, 0.92919921875, 0.37109375, -0.98974609375]], True, [[225, 255, 181, 0]]), + ([[0.70361328125, 0.92919921875, 0.37109375, -0.98974609375]], False, [[226, 255, 181, 0]]), + ], +) +def test_convertable_divison(weight, convertable_division, ref_compressed_weight): + ov_model_params = OVModelParameters( + input_dtypes={"weight": TensorDataType.float32}, + dynamic_shapes=not convertable_division, + convertable_division=convertable_division, + ) + config = WeightCompressionConfig(CompressWeightsMode.INT8_ASYM) + + weight = np.array(weight, np.float32) + ref_compressed_weight = np.array(ref_compressed_weight, np.uint8) + model_run_fn = get_compress_weight_model(ov_model_params, config, weight.shape, reduction_axes=(1,)) + compressed_weight = model_run_fn([Tensor(weight)])[0] + np.testing.assert_allclose(compressed_weight.data, ref_compressed_weight, atol=0, rtol=0) From 97bd61d5c46a2a278918251b15bf04f8dd6cec63 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 17:28:25 +0100 Subject: [PATCH 71/73] Add explicit inference precision --- .../weight_compression/openvino_modeling.py | 55 ++++++++++++------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index a9c569ea663..0abad80eb98 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -17,6 +17,7 @@ import numpy as np import openvino as ov from openvino._pyopenvino.op import Parameter +from openvino._pyopenvino.properties.hint import inference_precision from openvino.runtime import Node from openvino.runtime import opset13 as opset @@ -167,6 +168,32 @@ def _infer_ov_model( return outputs +def _prepare_compression_model_inputs( + ov_model_params, + weight_shape: Tuple, + scale_shape: Optional[Tuple], + zero_point_shape: Optional[Tuple], + reduction_axes: Optional[ReductionAxes], +) -> Tuple[Tuple, Optional[Tuple], Optional[Tuple]]: + """ + Do some input checks and convert static shapes to dynamic shapes if needed. 
+ """ + if scale_shape is None and zero_point_shape is not None: + raise Exception("Zero point shape can only be provided if scale shape is provided.") + if scale_shape is None and reduction_axes is None: + raise ValueError("Reduction axes must be provided if scale shape is not provided.") + + # Set dynamic shapes if needed + if ov_model_params.dynamic_shapes: + weight_shape = (-1,) * len(weight_shape) + if scale_shape is not None: + scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) + if zero_point_shape is not None: + zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) + + return weight_shape, scale_shape, zero_point_shape + + def get_compress_weight_model( ov_model_params: OVModelParameters, config: WeightCompressionConfig, @@ -193,16 +220,10 @@ def get_compress_weight_model( :return: A model callable that compresses weights using the given configuration. Or a model as nodes, if `return_nodes` is True. """ - if scale_shape is None and zero_point_shape is not None: - raise Exception("Zero point shape can only be provided if scale shape is provided.") - # Set dynamic shapes if needed - if ov_model_params.dynamic_shapes: - weight_shape = (-1,) * len(weight_shape) - if scale_shape is not None: - scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) - if zero_point_shape is not None: - zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) + weight_shape, scale_shape, zero_point_shape = _prepare_compression_model_inputs( + ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes + ) return _build_compress_model( config, @@ -243,13 +264,9 @@ def get_compress_decompress_weight_model( (and zero point) if `return_compressed_weight` is True. """ - # Set dynamic shapes if needed - if ov_model_params.dynamic_shapes: - weight_shape = (-1,) * len(weight_shape) - if scale_shape is not None: - scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) - if zero_point_shape is not None: - zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) + weight_shape, scale_shape, zero_point_shape = _prepare_compression_model_inputs( + ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes + ) return _build_compress_decompress_model( config, @@ -403,7 +420,7 @@ def _build_compress_model( return ov_parameters, ov_results, ov_model_params model = ov.Model(ov_results, ov_parameters) - compiled_model = ov.compile_model(model, device_name="CPU") + compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32}) return partial(_infer_ov_model, ov_model_params, compiled_model) @@ -459,7 +476,7 @@ def _build_compress_decompress_model( ov_results = [decompressed_weight] + ov_results if return_compressed_weight else [decompressed_weight] model = ov.Model(ov_results, ov_parameters) - compiled_model = ov.compile_model(model, device_name="CPU") + compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32}) return partial(_infer_ov_model, ov_model_params, compiled_model) @@ -497,6 +514,6 @@ def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> arg = opset.parameter(arg_shape, dtype=DTYPE_MAP_OV[input_dtypes["input"]], name="input") res = opset.convert(arg, DTYPE_MAP_OV[output_dtypes["output"]]) model = ov.Model([res], [arg]) - compiled_model = ov.compile_model(model, device_name="CPU") + compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32}) return partial(_infer_ov_model, ov_model_params, compiled_model) From 
58963abffee560316e5bb1ef5c89eb6eba1df23a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 18:10:07 +0100 Subject: [PATCH 72/73] Fix import --- .../algorithms/weight_compression/scale_estimation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 3330b1f7279..38116efa1a1 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -21,11 +21,11 @@ from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic +from nncf.import_utils import is_openvino_available from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization @@ -256,8 +256,13 @@ def calculate_quantization_params( zero_scale = 0.001 zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - # This is required for alignment with a previous OpenVINO models implementation - ov_model_params = OVModelParameters(dynamic_shapes=False, convertable_division=True) + if is_openvino_available(): + # This is required for alignment with a previous OpenVINO models implementation + from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters + + ov_model_params = OVModelParameters(dynamic_shapes=False, convertable_division=True) + else: + ov_model_params = None # iterative rectification of initial scale for i in range(initial_steps): From ec21996f4c7130347e09dc0f4ba14d4050b3ff38 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 21:21:40 +0100 Subject: [PATCH 73/73] Update tests/post_training/data/wc_reference_data.yaml --- tests/post_training/data/wc_reference_data.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 683dc62f401..6c48904c91a 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -23,7 +23,7 @@ tinyllama_int8_data_free_backend_TORCH: num_int4: 0 num_int8: 312 tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV: - metric_value: 0.88669 + metric_value: 0.86503 num_int4: 94 num_int8: 124 metrics_xfail_reason: "Issue-148819"
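Editor's note: patch 71 pins the helper compression models to f32 execution via the INFERENCE_PRECISION_HINT so results do not drift on devices that default to bf16 inference. Below is a standalone sketch of the same compile-time setting under stated assumptions: the toy divide-and-round graph, the tensor values, and the names compiled / out are illustrative only, while the import path and the config={inference_precision(): ov.Type.f32} argument mirror the patch.

import numpy as np
import openvino as ov
from openvino._pyopenvino.properties.hint import inference_precision  # same import path as the patch
from openvino.runtime import opset13 as opset

# Toy graph: divide by a scale and round, the kind of arithmetic the compression models perform.
weight = opset.parameter([2, 4], dtype=ov.Type.f32, name="weight")
scaled = opset.divide(weight, opset.constant(np.float32(0.1)))
rounded = opset.round(scaled, "half_to_even")
model = ov.Model([rounded], [weight])

# Pinning inference precision to f32 keeps rounding stable on bf16-first devices.
compiled = ov.compile_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32})
out = compiled([np.ones((2, 4), dtype=np.float32)])[0]

The same consideration motivates the new convertable_division test and the updated tinyllama reference metric: small numeric differences in division and rounding are expected once the compression path runs through compiled OpenVINO models instead of NumPy.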