diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py
index 39056d65af5..33d67140d16 100644
--- a/nncf/openvino/graph/node_utils.py
+++ b/nncf/openvino/graph/node_utils.py
@@ -8,7 +8,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
+
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type
 
 import numpy as np
@@ -107,7 +107,7 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int:
     return cnt_if_op(model, 0)
 
 
-def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = True) -> np.ndarray:
+def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = False) -> np.ndarray:
     """
     Returns the constant tensor for the node.
     This method is applicable only for the floating-point constant data.
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
index dcfcb35bae7..4920f21d9fd 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
@@ -296,8 +296,10 @@ def transform_model(
             const_node_output = const_node.output(0)
             const_dtype = const_node_output.get_element_type()
             weight = get_const_value(const_node, cast_bf16_to_fp32=False)
-            if const_dtype == ov.Type.bf16:
-                weight = ov.Tensor(weight, weight.shape, ov.Type.bf16)
+            # Creation of ov.Tensor is required for two reasons:
+            #   1. To be able to process BF16 weight properly
+            #   2. To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed
+            weight = ov.Tensor(weight, weight.shape, const_dtype)
             weight = Tensor(weight)
 
             should_add_convert_node = False
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
index 1008e872ba7..f12a3fa7a7b 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
@@ -11,7 +11,7 @@
 
 from dataclasses import dataclass
 from functools import partial
-from typing import Callable, List, Optional, Tuple
+from typing import Callable, List, Optional, Tuple, Union
 
 import numpy as np
 import openvino as ov
@@ -35,6 +35,7 @@
 @dataclass
 class OVModelParameters:
     input_dtype: TensorDataType
+    output_dtype: Optional[TensorDataType] = None
     dynamic_shapes: bool = False
     recompile: bool = False
     release_memory: bool = True
@@ -56,30 +57,28 @@ def __hash__(self):
         )
 
 
-def run_model(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList:
-    # Returns results as numpy tensors
+def run_model(
+    ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList, return_ov_tensors: bool
+) -> TensorList:
     if any(isinstance(it, Tensor) for it in inputs):
         inputs = [inp.data for inp in inputs]
 
-    outputs = compiled_model(
-        inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs
-    )
-    outputs = [Tensor(outputs[i]) for i in range(len(outputs))]
-    if ov_model_params.release_memory:
-        compiled_model.release_memory()
-    return outputs
+    if return_ov_tensors:
+        infer_request = compiled_model.create_infer_request()
+        infer_request.infer(
+            inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs
+        )
+        outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))]
+    else:
+        outputs = compiled_model(
+            inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs
+        )
+        outputs = [outputs[i] for i in range(len(outputs))]
+    outputs = [Tensor(it) for it in outputs]
 
-
-def run_model_via_infer_request(
-    ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList
-) -> TensorList:
-    # Returns results as ov tensors
-    if any(isinstance(it, Tensor) for it in inputs):
-        inputs = [inp.data for inp in inputs]
-    infer_request = compiled_model.create_infer_request()
-    infer_request.infer(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs)
-    outputs = [Tensor(infer_request.get_output_tensor(i)) for i in range(len(infer_request.results))]
     if ov_model_params.release_memory:
         compiled_model.release_memory()
+
     return outputs
@@ -93,8 +92,6 @@ def get_compress_weight_model(
 ) -> ModelCallable:
     if scale_shape is None and zero_point_shape is not None:
         raise Exception("Zero point shape can only be provided if scale shape is provided.")
-    # if (scale_shape is None) != (reduction_axes is not None):
-    #     raise Exception("Either one of scale_shape or reduction_axes must be provided at the same time.")
 
     if ov_model_params.dynamic_shapes:
         weight_shape = (-1,) * len(weight_shape)
@@ -103,9 +100,6 @@ def get_compress_weight_model(
         if zero_point_shape is not None:
             zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,)
 
-    if config.num_bits == 4:
-        ov_model_params.return_ov_tensors = True
-
     return _build_compress_model(
         config,
         ov_model_params,
@@ -150,28 +144,29 @@ def _build_compress_model(
     zero_point_shape: Optional[Tuple] = None,
     reduction_axes: Optional[Tuple] = None,
     return_nodes: bool = False,
-) -> ModelCallable:
+) -> Union[ModelCallable, Tuple[List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]:
     weight = opset.parameter(weight_shape, name="w", dtype=OV_DTYPE_MAP[ov_model_params.input_dtype])
     ov_parameters = [weight]
 
-    if scale_shape is not None:
-        # Compute only the compressed weight
+    mode = config.mode
+    asym_mode = mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]
+    num_bits = config.num_bits
+    eps = np.finfo(np.float32).eps
+    if asym_mode:
+        level_low = 0
+        level_high = 2**num_bits - 1
+    else:
+        level_low = -(2 ** (num_bits - 1))
+        level_high = 2 ** (num_bits - 1) - 1
+    min_values = None
+    if scale_shape is not None:
+        # Scale is given as an input
         scale = opset.parameter(scale_shape, name="s", dtype=ov.Type.f32)
         ov_parameters.append(scale)
-
-        zero_point = None
-        if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]:
-            zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32)
-            ov_parameters.append(zero_point)
-            zero_point = opset.convert(zero_point, ov.Type.f32)
     else:
-        # Compute compressed weight, scale and, possibly, zero point
-
-        mode = config.mode
-        num_bits = config.num_bits
-        eps = np.finfo(np.float32).eps
-        if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
+        # Compute scale
+        if asym_mode:
             min_values = opset.reduce_min(
                 weight, reduction_axes=reduction_axes, keep_dims=True
             )  # [a1, r, a2] -> [a1, 1, a2]
@@ -180,49 +175,64 @@ def _build_compress_model(
             max_values = opset.reduce_max(
                 weight, reduction_axes=reduction_axes, keep_dims=True
             )  # [a1, r, a2] -> [a1, 1, a2]
             min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32)
-            level_low = 0
-            level_high = 2**num_bits - 1
             levels = level_high - level_low + 1
             scale = (max_values - min_values) / opset.constant(levels - 1, ov.Type.f32)
             scale = opset.select(opset.abs(scale) < eps, eps, scale)
-
-            zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale)
-            zero_point = opset.clamp(zero_point, level_low, level_high)
         else:
-            zero_point = None
-            level_high = opset.constant(2 ** (num_bits - 1), ov.Type.f32)
-
             w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True))
             w_max = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True)
             w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32)
 
             scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.constant(0, ov.Type.f32) - w_max)
-            scale /= level_high
+            scale /= opset.constant(level_high, ov.Type.f32)
             scale = opset.select(opset.abs(scale) < eps, eps, scale)
 
+    zero_point = None
+    if zero_point_shape is not None:
+        # Zero point is given as an input
+        zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32)
+        ov_parameters.append(zero_point)
+        zero_point = opset.convert(zero_point, ov.Type.f32)
+    elif mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
+        # Compute zero point
+        if min_values is None:
+            min_values = opset.reduce_min(
+                weight, reduction_axes=reduction_axes, keep_dims=True
+            )  # [a1, r, a2] -> [a1, 1, a2]
+            min_values = opset.convert(min_values, ov.Type.f32)
+
+        level_low = 0
+        level_high = 2**num_bits - 1
+        zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale)
+        zero_point = opset.clamp(zero_point, level_low, level_high)
+
     if weight.get_element_type() != ov.Type.f32:
         weight = opset.convert(weight, ov.Type.f32)
 
     compressed_w = weight / scale
-    num_bits = config.num_bits
-    if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]:
-        dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4
-        level_low = 0
-        level_high = 2**num_bits - 1
+    if asym_mode:
+        if ov_model_params.output_dtype is not None:
+            dtype = OV_DTYPE_MAP[ov_model_params.output_dtype]
+        else:
+            dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4
         compressed_w += zero_point
-    elif config.mode in [CompressWeightsMode.INT8_SYM, config.mode.INT4_SYM]:
-        dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.u4
-        level_low = -(2 ** (num_bits - 1))
-        level_high = 2 ** (num_bits - 1) - 1
     else:
-        raise Exception
+        if ov_model_params.output_dtype is not None:
+            dtype = OV_DTYPE_MAP[ov_model_params.output_dtype]
+        else:
+            dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4
 
     compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high)
     compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights")
 
     ov_results = [compressed_w]
-    if len(ov_parameters) == 1:
-        ov_results.append(scale)
+    if len(ov_parameters) != 3:
+        # Two cases:
+        #   1. weight -> compressed_weight, scale, (zero_point)
+        #   2. weight, scale -> compressed_weight, (zero_point)
+        if len(ov_parameters) == 1:
+            ov_results.append(scale)
+
         if zero_point is not None:
             ov_results.append(opset.convert(zero_point, compressed_w.get_element_type()))
@@ -232,8 +242,7 @@ def _build_compress_model(
 
     model = ov.Model(ov_results, ov_parameters)
     compiled_model = ov.compile_model(model, device_name="CPU")
-    run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model
-    return partial(run_fn, ov_model_params, compiled_model)
+    return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors)
 
 
 @cache_results(OV_MODEL_CACHE)
@@ -249,25 +258,32 @@ def _build_compress_decompress_model(
     )
 
     if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
-        if len(ov_results) == 1:
-            compressed_w = ov_results[0]
-            s, zp = ov_parameters[1], ov_parameters[2]
+        if len(ov_parameters) == 1:
+            # weight -> compressed_weight, scale, zero_point
+            compressed_w, scale, zero_point = ov_results
+        elif len(ov_parameters) == 2:
+            # weight, scale -> compressed_weight, zero_point
+            compressed_w, zero_point = ov_results
+            scale = ov_parameters[1]
         else:
-            compressed_w, s, zp = ov_results
-        decompressed_w = (compressed_w - zp) * s
-    else:
-        if len(ov_results) == 1:
+            # weight, scale, zero_point -> compressed_weight
             compressed_w = ov_results[0]
-            s = ov_parameters[1]
+            scale, zero_point = ov_parameters[1:]
+        decompressed_w = opset.convert(opset.convert(compressed_w, ov.Type.i32) - zero_point, ov.Type.f32) * scale
+    else:
+        if len(ov_parameters) == 1:
+            # weight -> compressed_weight, scale
+            compressed_w, scale = ov_results
         else:
-            compressed_w, s = ov_results
-        decompressed_w = compressed_w * s
+            # weight, scale -> compressed_weight
+            compressed_w = ov_results[0]
+            scale = ov_parameters[1]
+        decompressed_w = opset.convert(compressed_w, ov.Type.f32) * scale
 
     model = ov.Model([decompressed_w], ov_parameters)
     compiled_model = ov.compile_model(model, device_name="CPU")
-    run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model
-    return partial(run_fn, ov_model_params, compiled_model)
+    return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors)
 
 
 def get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable:
@@ -283,5 +299,4 @@ def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dt
 
     model = ov.Model([res], [arg])
     compiled_model = ov.compile_model(model, device_name="CPU")
-    run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model
-    return partial(run_fn, ov_model_params, compiled_model)
+    return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors)
diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
index c26f00634d9..8c2c5493a61 100644
--- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py
+++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -479,14 +479,23 @@ def do_int_quantization(
     scale_shape = None if precomputed_scale is None else precomputed_scale.shape
     zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape
 
+    asym_mode = config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]
     if ov_model_params is None:
-        # ov_model_params = OVModelParameters(weight.dtype)
+        output_dtype = None
+        return_ov_tensors = False
+        if config.num_bits == 4:
+            if weight.backend == TensorBackend.ov:
+                return_ov_tensors = weight.backend == TensorBackend.ov
+            else:
+                output_dtype = TensorDataType.uint8 if asym_mode else TensorDataType.int8
         ov_model_params = OVModelParameters(
-            weight.dtype,
+            input_dtype=weight.dtype,
+            output_dtype=output_dtype,
             dynamic_shapes=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))),
             recompile=bool(int(os.environ.get("RECOMPILE", "0"))),
             release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))),
             share_outputs=bool(int(os.environ.get("SHARE_OUTPUTS", "0"))),
+            return_ov_tensors=return_ov_tensors,
         )
 
     model = get_compress_weight_model(
@@ -499,14 +508,27 @@ def do_int_quantization(
     )
 
     if precomputed_scale is None:
-        compressed_weight, scale, zero_point = model([weight])
+        # weight -> compressed_weight, scale, (zero_point)
+        results = model([weight])
+        if asym_mode:
+            compressed_weight, scale, zero_point = results
+        else:
+            compressed_weight, scale = results
+            zero_point = None
+
+        # Scale is always in fp32 so there is no need to store it in ov.Tensor
         if scale.backend == TensorBackend.ov:
             scale = scale.to_backend(TensorBackend.numpy)
+    elif precomputed_zero_point is None and asym_mode:
+        # weight, scale -> compressed_weight, zero_point
+        compressed_weight, zero_point = model([weight, precomputed_scale])
+        scale = precomputed_scale
     else:
-        inputs = [weight, precomputed_scale]
-        if precomputed_zero_point is not None:
-            inputs += [precomputed_zero_point]
+        inputs = (
+            [weight, precomputed_scale]
+            if precomputed_zero_point is None
+            else [weight, precomputed_scale, precomputed_zero_point]
+        )
         compressed_weight = model(inputs)[0]
         scale, zero_point = precomputed_scale, precomputed_zero_point
diff --git a/nncf/results_caching.py b/nncf/results_caching.py
index 4a991a36be7..d1d16ea775b 100644
--- a/nncf/results_caching.py
+++ b/nncf/results_caching.py
@@ -1,3 +1,14 @@
+# Copyright (c) 2024 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect