Skip to content

Commit

Permalink
Support the case of (weight, scale) inputs producing (compressed_weight, zero_point) outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
nikita-savelyevv committed Oct 26, 2024
1 parent fdc9283 commit ca3447c
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 86 deletions.
4 changes: 2 additions & 2 deletions nncf/openvino/graph/node_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

from typing import Any, Callable, Dict, List, Optional, Tuple, Type

import numpy as np
Expand Down Expand Up @@ -107,7 +107,7 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int:
return cnt_if_op(model, 0)


def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = True) -> np.ndarray:
def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = False) -> np.ndarray:
"""
Returns the constant tensor for the node.
This method is applicable only for the floating-point constant data.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 +296,10 @@ def transform_model(
const_node_output = const_node.output(0)
const_dtype = const_node_output.get_element_type()
weight = get_const_value(const_node, cast_bf16_to_fp32=False)
if const_dtype == ov.Type.bf16:
weight = ov.Tensor(weight, weight.shape, ov.Type.bf16)
# Creation of ov.Tensor is required for two reasons:
# 1. To be able to process BF16 weight properly
# 2. To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed
weight = ov.Tensor(weight, weight.shape, const_dtype)
weight = Tensor(weight)

should_add_convert_node = False
Expand Down
167 changes: 91 additions & 76 deletions nncf/quantization/algorithms/weight_compression/openvino_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from dataclasses import dataclass
from functools import partial
from typing import Callable, List, Optional, Tuple
from typing import Callable, List, Optional, Tuple, Union

import numpy as np
import openvino as ov
Expand All @@ -35,6 +35,7 @@
@dataclass
class OVModelParameters:
input_dtype: TensorDataType
output_dtype: Optional[TensorDataType] = None
dynamic_shapes: bool = False
recompile: bool = False
release_memory: bool = True
Expand All @@ -56,30 +57,28 @@ def __hash__(self):
)


def run_model(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList:
# Returns results as numpy tensors
def run_model(
ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList, return_ov_tensors: bool
) -> TensorList:
if any(isinstance(it, Tensor) for it in inputs):
inputs = [inp.data for inp in inputs]
outputs = compiled_model(
inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs
)
outputs = [Tensor(outputs[i]) for i in range(len(outputs))]
if ov_model_params.release_memory:
compiled_model.release_memory()
return outputs

if return_ov_tensors:
infer_request = compiled_model.create_infer_request()
infer_request.infer(
inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs
)
outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))]
else:
outputs = compiled_model(
inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs
)
outputs = [outputs[i] for i in range(len(outputs))]
outputs = [Tensor(it) for it in outputs]

def run_model_via_infer_request(
ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList
) -> TensorList:
# Returns results as ov tensors
if any(isinstance(it, Tensor) for it in inputs):
inputs = [inp.data for inp in inputs]
infer_request = compiled_model.create_infer_request()
infer_request.infer(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs)
outputs = [Tensor(infer_request.get_output_tensor(i)) for i in range(len(infer_request.results))]
if ov_model_params.release_memory:
compiled_model.release_memory()

return outputs


Expand All @@ -93,8 +92,6 @@ def get_compress_weight_model(
) -> ModelCallable:
if scale_shape is None and zero_point_shape is not None:
raise Exception("Zero point shape can only be provided if scale shape is provided.")
# if (scale_shape is None) != (reduction_axes is not None):
# raise Exception("Either one of scale_shape or reduction_axes must be provided at the same time.")

if ov_model_params.dynamic_shapes:
weight_shape = (-1,) * len(weight_shape)
Expand All @@ -103,9 +100,6 @@ def get_compress_weight_model(
if zero_point_shape is not None:
zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,)

if config.num_bits == 4:
ov_model_params.return_ov_tensors = True

return _build_compress_model(
config,
ov_model_params,
Expand Down Expand Up @@ -150,28 +144,29 @@ def _build_compress_model(
zero_point_shape: Optional[Tuple] = None,
reduction_axes: Optional[Tuple] = None,
return_nodes: bool = False,
) -> ModelCallable:
) -> Union[ModelCallable, Tuple[List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]:
weight = opset.parameter(weight_shape, name="w", dtype=OV_DTYPE_MAP[ov_model_params.input_dtype])
ov_parameters = [weight]

if scale_shape is not None:
# Compute only the compressed weight
mode = config.mode
asym_mode = mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]
num_bits = config.num_bits
eps = np.finfo(np.float32).eps
if asym_mode:
level_low = 0
level_high = 2**num_bits - 1
else:
level_low = -(2 ** (num_bits - 1))
level_high = 2 ** (num_bits - 1) - 1

min_values = None
if scale_shape is not None:
# Scale is given as an input
scale = opset.parameter(scale_shape, name="s", dtype=ov.Type.f32)
ov_parameters.append(scale)

zero_point = None
if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]:
zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32)
ov_parameters.append(zero_point)
zero_point = opset.convert(zero_point, ov.Type.f32)
else:
# Compute compressed weight, scale and, possibly, zero point

mode = config.mode
num_bits = config.num_bits
eps = np.finfo(np.float32).eps
if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
# Compute scale
if asym_mode:
min_values = opset.reduce_min(
weight, reduction_axes=reduction_axes, keep_dims=True
) # [a1, r, a2] -> [a1, 1, a2]
Expand All @@ -180,49 +175,64 @@ def _build_compress_model(
) # [a1, r, a2] -> [a1, 1, a2]
min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32)

level_low = 0
level_high = 2**num_bits - 1
levels = level_high - level_low + 1
scale = (max_values - min_values) / opset.constant(levels - 1, ov.Type.f32)
scale = opset.select(opset.abs(scale) < eps, eps, scale)

zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale)
zero_point = opset.clamp(zero_point, level_low, level_high)
else:
zero_point = None
level_high = opset.constant(2 ** (num_bits - 1), ov.Type.f32)

w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True))
w_max = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True)
w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32)

scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.constant(0, ov.Type.f32) - w_max)
scale /= level_high
scale /= opset.constant(level_high, ov.Type.f32)
scale = opset.select(opset.abs(scale) < eps, eps, scale)

zero_point = None
if zero_point_shape is not None:
# Zero point is given as an input
zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32)
ov_parameters.append(zero_point)
zero_point = opset.convert(zero_point, ov.Type.f32)
elif mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
# Compute zero point
if min_values is None:
min_values = opset.reduce_min(
weight, reduction_axes=reduction_axes, keep_dims=True
) # [a1, r, a2] -> [a1, 1, a2]
min_values = opset.convert(min_values, ov.Type.f32)

level_low = 0
level_high = 2**num_bits - 1
zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale)
zero_point = opset.clamp(zero_point, level_low, level_high)

if weight.get_element_type() != ov.Type.f32:
weight = opset.convert(weight, ov.Type.f32)
compressed_w = weight / scale

num_bits = config.num_bits
if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]:
dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4
level_low = 0
level_high = 2**num_bits - 1
if asym_mode:
if ov_model_params.output_dtype is not None:
dtype = OV_DTYPE_MAP[ov_model_params.output_dtype]
else:
dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4
compressed_w += zero_point
elif config.mode in [CompressWeightsMode.INT8_SYM, config.mode.INT4_SYM]:
dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.u4
level_low = -(2 ** (num_bits - 1))
level_high = 2 ** (num_bits - 1) - 1
else:
raise Exception
if ov_model_params.output_dtype is not None:
dtype = OV_DTYPE_MAP[ov_model_params.output_dtype]
else:
dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4

compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high)
compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights")

ov_results = [compressed_w]
if len(ov_parameters) == 1:
ov_results.append(scale)
if len(ov_parameters) != 3:
# Two cases:
# 1. weight -> compressed_weight, scale, (zero_point)
# 2. weight, scale -> compressed_weight, (zero_point)
if len(ov_parameters) == 1:
ov_results.append(scale)

if zero_point is not None:
ov_results.append(opset.convert(zero_point, compressed_w.get_element_type()))

Expand All @@ -232,8 +242,7 @@ def _build_compress_model(
model = ov.Model(ov_results, ov_parameters)
compiled_model = ov.compile_model(model, device_name="CPU")

run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model
return partial(run_fn, ov_model_params, compiled_model)
return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors)


@cache_results(OV_MODEL_CACHE)
Expand All @@ -249,25 +258,32 @@ def _build_compress_decompress_model(
)

if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
if len(ov_results) == 1:
compressed_w = ov_results[0]
s, zp = ov_parameters[1], ov_parameters[2]
if len(ov_parameters) == 1:
# weight -> compressed_weight, scale, zero_point
compressed_w, scale, zero_point = ov_results
elif len(ov_parameters) == 2:
# weight, scale -> compressed_weight, zero_point
compressed_w, zero_point = ov_results
scale = ov_parameters[1]
else:
compressed_w, s, zp = ov_results
decompressed_w = (compressed_w - zp) * s
else:
if len(ov_results) == 1:
# weight, scale, zero_point -> compressed_weight
compressed_w = ov_results[0]
s = ov_parameters[1]
scale, zero_point = ov_parameters[1:]
decompressed_w = opset.convert(opset.convert(compressed_w, ov.Type.i32) - zero_point, ov.Type.f32) * scale
else:
if len(ov_parameters) == 1:
# weight -> compressed_weight, scale
compressed_w, scale = ov_results
else:
compressed_w, s = ov_results
decompressed_w = compressed_w * s
# weight, scale -> compressed_weight
compressed_w = ov_results[0]
scale = ov_parameters[1]
decompressed_w = opset.convert(compressed_w, ov.Type.f32) * scale

model = ov.Model([decompressed_w], ov_parameters)
compiled_model = ov.compile_model(model, device_name="CPU")

run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model
return partial(run_fn, ov_model_params, compiled_model)
return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors)


def get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable:
Expand All @@ -283,5 +299,4 @@ def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dt
model = ov.Model([res], [arg])
compiled_model = ov.compile_model(model, device_name="CPU")

run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model
return partial(run_fn, ov_model_params, compiled_model)
return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors)
34 changes: 28 additions & 6 deletions nncf/quantization/algorithms/weight_compression/weight_lowering.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,14 +479,23 @@ def do_int_quantization(
scale_shape = None if precomputed_scale is None else precomputed_scale.shape
zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape

asym_mode = config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]
if ov_model_params is None:
# ov_model_params = OVModelParameters(weight.dtype)
output_dtype = None
return_ov_tensors = False
if config.num_bits == 4:
if weight.backend == TensorBackend.ov:
return_ov_tensors = weight.backend == TensorBackend.ov
else:
output_dtype = TensorDataType.uint8 if asym_mode else TensorDataType.int8
ov_model_params = OVModelParameters(
weight.dtype,
input_dtype=weight.dtype,
output_dtype=output_dtype,
dynamic_shapes=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))),
recompile=bool(int(os.environ.get("RECOMPILE", "0"))),
release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))),
share_outputs=bool(int(os.environ.get("SHARE_OUTPUTS", "0"))),
return_ov_tensors=return_ov_tensors,
)

model = get_compress_weight_model(
Expand All @@ -499,14 +508,27 @@ def do_int_quantization(
)

if precomputed_scale is None:
compressed_weight, scale, zero_point = model([weight])
# weight -> compressed_weight, scale, (zero_point)
results = model([weight])
if asym_mode:
compressed_weight, scale, zero_point = results
else:
compressed_weight, scale = results
zero_point = None

# Scale is always in fp32 so there is no need to store it in ov.Tensor
if scale.backend == TensorBackend.ov:
scale = scale.to_backend(TensorBackend.numpy)
elif precomputed_zero_point is None and asym_mode:
# weight, scale -> compressed_weight, zero_point
compressed_weight, zero_point = model([weight, precomputed_scale])
scale = precomputed_scale
else:
inputs = [weight, precomputed_scale]
if precomputed_zero_point is not None:
inputs += [precomputed_zero_point]
inputs = (
[weight, precomputed_scale]
if precomputed_zero_point is None
else [weight, precomputed_scale, precomputed_zero_point]
)
compressed_weight = model(inputs)[0]
scale, zero_point = precomputed_scale, precomputed_zero_point

Expand Down
11 changes: 11 additions & 0 deletions nncf/results_caching.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
# Copyright (c) 2024 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect


Expand Down

0 comments on commit ca3447c

Please sign in to comment.