Tests WIP
nikita-savelyevv committed Nov 1, 2024
1 parent d20e593 commit fe42f9a
Showing 8 changed files with 377 additions and 43 deletions.
3 changes: 3 additions & 0 deletions nncf/quantization/algorithms/weight_compression/config.py
@@ -54,6 +54,9 @@ def is_integer(self):
def __hash__(self):
return hash((self.mode.value, self.group_size))

def __str__(self):
return f"{self.mode.value}_{self.group_size}"


@dataclass
class WeightCompressionParameters:
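For context, the new __str__ turns a config into a readable identifier such as "int4_sym_128" (mode value plus group size), which is convenient as part of a cache key or a log message. A minimal self-contained sketch of the behaviour, using a reduced stand-in for CompressWeightsMode (illustration only, not the NNCF classes):

from dataclasses import dataclass
from enum import Enum

class CompressWeightsMode(Enum):
    # Illustrative subset of modes; the real enum lives elsewhere in NNCF
    INT4_SYM = "int4_sym"
    INT8_ASYM = "int8_asym"

@dataclass
class WeightCompressionConfig:
    mode: CompressWeightsMode = CompressWeightsMode.INT8_ASYM
    group_size: int = -1

    def __hash__(self):
        return hash((self.mode.value, self.group_size))

    def __str__(self):
        return f"{self.mode.value}_{self.group_size}"

print(WeightCompressionConfig(CompressWeightsMode.INT4_SYM, 128))  # -> int4_sym_128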
@@ -46,6 +46,7 @@
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm
from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE
from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight
from nncf.tensor import Tensor
from nncf.tensor.definitions import TensorBackend
@@ -127,6 +128,7 @@ def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.
def get_weight_dtype(
self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph
) -> TensorDataType:
# TODO: replace this local mapping with DTYPE_MAP from nncf.tensor.functions.ov
ov_type_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["dtype"]
dtype_map = {
"f16": TensorDataType.float16,
@@ -277,7 +279,6 @@ def _create_compression_subgraph(

if should_add_convert_node:
mul = opset.convert(mul, const_dtype, name=f"{const_node_name}/fq_weights_{weight_port_id}/convert")

return mul, compressed_weight

def transform_model(
@@ -344,6 +345,8 @@ def transform_model(
# reset name_to_node_mapping
self.name_to_node_mapping = None

OV_MODEL_CACHE.clear()

return model

@staticmethod
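The new OV_MODEL_CACHE.clear() above suggests that helper models compiled during compression are memoized in openvino_modeling.py and released once transform_model finishes, so repeated weights with the same shapes and config reuse a single compiled model. A purely hypothetical sketch of such a results cache (class name, decorator and behaviour are assumptions, not the actual implementation):

from typing import Any, Callable, Dict, Tuple

class ResultsCache:
    # Hypothetical memoizing cache keyed by the builder function and its arguments
    def __init__(self) -> None:
        self._cache: Dict[Tuple, Any] = {}

    def wrap(self, fn: Callable) -> Callable:
        def wrapper(*args):
            key = (fn.__qualname__, *args)
            if key not in self._cache:
                self._cache[key] = fn(*args)
            return self._cache[key]
        return wrapper

    def clear(self) -> None:
        self._cache.clear()

OV_MODEL_CACHE = ResultsCache()

@OV_MODEL_CACHE.wrap
def build_model(weight_shape: Tuple[int, ...], config_str: str) -> str:
    # Stand-in for compiling an OpenVINO helper model for the given shape/config
    return f"compiled:{config_str}:{weight_shape}"

print(build_model((16, 32), "int4_sym_128"))  # built once
print(build_model((16, 32), "int4_sym_128"))  # served from the cache
OV_MODEL_CACHE.clear()  # released after transform_model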
@@ -36,7 +36,7 @@
class OVModelParameters:
input_dtype: TensorDataType
output_dtype: Optional[TensorDataType] = None
dynamic_shapes: bool = False
dynamic_shapes: bool = True  # TODO: set back to False once ticket 156511 is resolved
recompile: bool = False
release_memory: bool = True
share_inputs: bool = True
@@ -124,7 +124,8 @@ def get_compress_decompress_weight_model(
) -> ModelCallable:
if ov_model_params.dynamic_shapes:
weight_shape = (-1,) * len(weight_shape)
scale_shape = (-1,) * (len(scale_shape) - 1) + (1,)
if scale_shape is not None:
scale_shape = (-1,) * (len(scale_shape) - 1) + (1,)
if zero_point_shape is not None:
zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,)

@@ -223,19 +224,18 @@ def _build_compress_model(
else:
dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4

compressed_w = opset.round(compressed_w)
compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high)
compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights")

ov_results = [compressed_w]
if len(ov_parameters) != 3:
# Two cases:
# 1. weight -> compressed_weight, scale, (zero_point)
# 2. weight, scale -> compressed_weight, (zero_point)
if len(ov_parameters) == 1:
ov_results.append(scale)

if len(ov_parameters) == 1:
ov_results.append(scale)
if zero_point is not None:
ov_results.append(opset.convert(zero_point, compressed_w.get_element_type()))
zero_point_dtype = compressed_w.get_element_type() if ov_model_params.return_ov_tensors else ov.Type.i32
if zero_point.get_element_type() != zero_point_dtype:
zero_point = opset.convert(zero_point, zero_point_dtype)
ov_results.append(zero_point)

if return_nodes:
return ov_parameters, ov_results
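This helper assembles a small OpenVINO graph (divide, round, clamp, convert to a low-bit type), and the compiled model then acts as a compute kernel for weight compression. A rough standalone illustration of the same pattern, assuming an installed OpenVINO runtime; opset13, the INT8 range and the parameter shapes here are illustrative assumptions, not the NNCF graph:

import numpy as np
import openvino as ov
from openvino.runtime import opset13 as opset

# Toy symmetric INT8 compression graph: clamp(round(w / scale)) -> i8
w = opset.parameter([-1, -1], dtype=ov.Type.f32, name="weight")
scale = opset.parameter([-1, 1], dtype=ov.Type.f32, name="scale")
q = opset.clamp(opset.round(opset.divide(w, scale)), -128, 127)
q = opset.convert(q, ov.Type.i8, name="compressed_weights")
model = ov.Model([q], [w, scale], name="toy_compress")

compiled = ov.Core().compile_model(model, "CPU")
weight = np.random.randn(4, 8).astype(np.float32)
s = np.max(np.abs(weight), axis=1, keepdims=True) / 127
print(compiled([weight, s])[0])  # int8 weights with the same shape as `weight`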
@@ -264,18 +264,13 @@ def _build_compress_decompress_model(
if len(ov_parameters) == 1:
# weight -> compressed_weight, scale, zero_point
compressed_w, scale, zero_point = ov_results
elif len(ov_parameters) == 2:
# weight, scale -> compressed_weight, zero_point
compressed_w, zero_point = ov_results
scale = ov_parameters[1]
else:
# weight, scale, zero_point -> compressed_weight
compressed_w = ov_results[0]
scale, zero_point = ov_parameters[1:]

decompressed_w = scale * opset.convert(
opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32), ov.Type.f32
)
subtract_zero_point = opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32)
decompressed_w = scale * opset.convert(subtract_zero_point, ov.Type.f32)
else:
if len(ov_parameters) == 1:
# weight -> compressed_weight, scale
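For reference, the decompression branch above computes scale * (compressed_w - zero_point) via int32 and float32 conversions. A self-contained NumPy toy of the full asymmetric INT4 round trip (group-wise quantize, then dequantize), independent of the OpenVINO graphs built in this file:

import numpy as np

def quantize_int4_asym(weight: np.ndarray, group_size: int = 4):
    # Toy per-group asymmetric INT4 quantization: returns compressed weight, scale, zero point
    level_low, level_high = 0, 15  # unsigned 4-bit levels
    w = weight.reshape(weight.shape[0], -1, group_size)
    w_min = np.minimum(w.min(axis=-1, keepdims=True), 0.0)
    w_max = np.maximum(w.max(axis=-1, keepdims=True), 0.0)
    scale = np.maximum((w_max - w_min) / (level_high - level_low), np.finfo(np.float32).eps)
    zero_point = np.clip(np.round(-w_min / scale), level_low, level_high)
    compressed = np.clip(np.round(w / scale + zero_point), level_low, level_high).astype(np.uint8)
    return compressed, scale, zero_point

w = np.random.rand(2, 8).astype(np.float32) - 0.5
cw, scale, zp = quantize_int4_asym(w)
decompressed = (scale * (cw.astype(np.float32) - zp)).reshape(w.shape)
print(np.max(np.abs(decompressed - w)))  # small per-group quantization error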
@@ -110,7 +110,7 @@ def apply(
graph: NNCFGraph,
statistic_points: Optional[StatisticPointsContainer] = None,
dataset: Optional[Dataset] = None,
) -> Dict[str, Tensor]:
) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]:
"""
Estimates better scale for the int4 nodes in the model.
Minimizes per-group difference between floating point MatMul and
@@ -122,10 +122,10 @@
:param graph: Model graph.
:param statistic_points: Statistic points with collected statistics values.
:param dataset: A representative dataset for the calibration process.
:return: Dict with pairs (weight name, estimated scale).
:return: Two dicts mapping each weight name to its estimated scale and zero point, respectively.
"""

scales = dict()
scales, zero_points = dict(), dict()

for wp in track(self._all_weight_params, description="Applying Scale Estimation"):
weight_name = wp.weight_name
@@ -145,7 +145,7 @@

weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph)

scales[weight_name], _ = self.calculate_quantization_params(
scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params(
self._backend_entity,
stats,
weight,
@@ -157,7 +157,7 @@
self._weight_penalty,
)

return scales
return scales, zero_points

@staticmethod
def calculate_quantization_params(
@@ -352,6 +352,8 @@ def calculate_quantization_params(

if config.group_size == -1:
result_scale = fns.squeeze(result_scale, axis=1)
if zp is not None and config.group_size == -1:
zp = fns.squeeze(zp, axis=1)

return result_scale, zp

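The docstring above summarizes the idea: per-group scales are adjusted so that the MatMul with compressed weights stays close to the floating point MatMul. A toy, self-contained version of that search (a plain grid over scale shrink factors, not the actual NNCF algorithm):

import numpy as np

def toy_scale_search(weight: np.ndarray, activations: np.ndarray, steps: int = 20):
    # Pick the scale shrink factor that minimizes the overall MatMul error
    level_high = 7  # symmetric signed 4-bit
    base_scale = np.max(np.abs(weight), axis=-1, keepdims=True) / level_high
    best_scale, best_err = base_scale, np.inf
    for step in range(1, steps + 1):
        scale = base_scale * step / steps
        q = np.clip(np.round(weight / scale), -level_high - 1, level_high)
        err = np.mean((activations @ weight.T - activations @ (q * scale).T) ** 2)
        if err < best_err:
            best_scale, best_err = scale, err
    return best_scale, best_err

weight = np.random.randn(16, 32).astype(np.float32)
activations = np.random.randn(8, 32).astype(np.float32)
scale, err = toy_scale_search(weight, activations)
print(err)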
34 changes: 22 additions & 12 deletions nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -142,7 +142,9 @@ def calculate_e2m1_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val=
return scale


def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bits=4, invert_division=False) -> Tensor:
def calculate_signed_scale(
weight: Tensor, reduction_axes: ReductionAxes, num_bits=4, invert_division: Optional[bool] = True
) -> Tensor:
"""
Calculates the signed scale for symmetric quantization.
@@ -255,7 +257,10 @@ def calculate_normalized_weight_and_fp4_scale(


def calculate_integer_quantization_params(
weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, invert_division=False
weight: Tensor,
reduction_axes: ReductionAxes,
config: WeightCompressionConfig,
invert_division: Optional[bool] = True,
) -> Tuple[Tensor, Tensor]:
"""
Calculates the scale and zero point for uniform quantization (INT4, INT8), when the range of values is divided into
@@ -291,7 +296,7 @@ def calculate_quantized_weight(
config: WeightCompressionConfig,
scale: Tensor,
zero_point: Optional[Tensor] = None,
invert_division=False,
invert_division: Optional[bool] = True,
) -> Tensor:
"""
Quantizes the weight tensor using the provided scale and zero point.
@@ -327,7 +332,10 @@ def calculate_quantized_weight(


def get_integer_quantization_error(
weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, invert_division=False
weight: Tensor,
reduction_axes: ReductionAxes,
config: WeightCompressionConfig,
invert_division: Optional[bool] = True,
) -> float:
"""
Calculates a quantity characterizing the difference between floating point weights and fake quantized
@@ -361,7 +369,7 @@ def compress_weight(
config: WeightCompressionConfig,
precomputed_scale: Tensor = None,
precomputed_zero_point: Tensor = None,
invert_division=False,
invert_division: Optional[bool] = True,
):
"""
Compress weight using compression configuration.
@@ -435,7 +443,7 @@ def do_int_quantization(
reduction_axes: Optional[ReductionAxes] = None,
precomputed_scale: Tensor = None,
precomputed_zero_point: Tensor = None,
invert_division: Optional[bool] = False,
invert_division: Optional[bool] = True,
ov_model_params: Optional = None,
):
"""
@@ -453,6 +461,11 @@
:return: A tuple containing the compressed weights, scale, and zero point tensors.
"""
assert config.is_integer, "The function supports integer quantization only"
if config.is_int_asym and (precomputed_scale is None) != (precomputed_zero_point is None):
raise ValueError(
"If precomputed quantization parameters are provided, both scale and zero point are required "
"for asymmetric quantization."
)

# import os
accelerate_through_ov = (
@@ -528,11 +541,8 @@ def do_int_quantization(
# Scale is always in fp32 so there is no need to store it in ov.Tensor
if scale.backend == TensorBackend.ov:
scale = scale.to_backend(TensorBackend.numpy)
elif precomputed_zero_point is None and config.is_int_asym:
# weight, scale -> compressed_weight, zero_point
compressed_weight, zero_point = model([weight, precomputed_scale])
scale = precomputed_scale
else:
# weight, scale, (zero_point) -> compressed_weight
inputs = (
[weight, precomputed_scale]
if precomputed_zero_point is None
@@ -550,7 +560,7 @@ def calculate_quantized_dequantized_weight(
reduction_axes: Optional[ReductionAxes] = None,
precomputed_scale: Optional[Tensor] = None,
precomputed_zero_point: Optional[Tensor] = None,
invert_division: Optional[bool] = False,
invert_division: Optional[bool] = True,
return_compressed_weight: Optional[bool] = False,
ov_model_params: Optional = None,
) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]:
@@ -606,7 +616,7 @@ def calculate_quantized_dequantized_weight(
if precomputed_zero_point is not None:
inputs.append(precomputed_zero_point)

compressed_weight, scale, zero_point = None, None, None
compressed_weight, scale, zero_point = None, precomputed_scale, precomputed_zero_point
results = model(inputs)
if len(results) == 1:
decompressed_weight = results[0]
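Several defaults in this file flip invert_division from False to True. Assuming the flag selects multiplication by the inverted scale, w * (1 / s), instead of direct division, w / s (which is what the name suggests), the two computations are not bit-identical in float32, so a consistent default matters for reproducibility between the NumPy and OpenVINO paths. A small NumPy check of that difference:

import numpy as np

rng = np.random.default_rng(0)
w = rng.random(1000, dtype=np.float32)
s = rng.random(1000, dtype=np.float32) + 0.5
direct = w / s
inverted = w * (np.float32(1.0) / s)
print(int(np.sum(direct != inverted)))  # typically non-zero: the two paths round differently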
2 changes: 1 addition & 1 deletion nncf/quantization/fake_quantize.py
@@ -344,7 +344,7 @@ def calculate_scale_zero_point(
level_low: int,
level_high: int,
narrow_range: bool,
invert_division: Optional[bool] = False,
invert_division: Optional[bool] = True,
) -> Tuple[Tensor, Tensor]:
"""
Calculates scale and zero_point values for the quantizer.
27 changes: 20 additions & 7 deletions nncf/tensor/functions/ov.py
@@ -17,7 +17,8 @@
from nncf.tensor.functions import numeric

from ..definitions import TensorBackend
from .numpy_numeric import DTYPE_MAP as NP_DTYPE_MAP
from .numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP
from .numpy_numeric import DTYPE_MAP_REV as DTYPE_MAP_REV_NP

DTYPE_MAP = {
TensorDataType.float16: ov.Type.f16,
@@ -40,7 +41,6 @@ def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor:
from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model

a_dtype = DTYPE_MAP_REV[a.get_element_type()]
assert a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]

model = get_astype_model(
OVModelParameters(
@@ -65,11 +65,13 @@ def _(a: ov.Tensor) -> TensorBackend:

@numeric.astype.register(ov.Tensor)
def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor:
a_dtype = DTYPE_MAP_REV[a.get_element_type()]
if a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]:
if a.get_element_type() in [ov.Type.bf16, ov.Type.i4, ov.Type.u4] or dtype in [
TensorDataType.bfloat16,
TensorDataType.int4,
TensorDataType.uint4,
]:
return _ov_astype(a, dtype)

return ov.Tensor(a.data.astype(NP_DTYPE_MAP[dtype]))
return ov.Tensor(a.data.astype(DTYPE_MAP_NP[dtype]))


@numeric.dtype.register(ov.Tensor)
@@ -87,8 +89,19 @@ def _(a: ov.Tensor, shape: Union[int, Tuple[int, ...]]) -> ov.Tensor:
return ov.Tensor(a.data.reshape(shape), shape, a.get_element_type())


@numeric.to_backend.register(np.ndarray)
def _(a: np.ndarray, b: TensorBackend) -> Union[np.ndarray, ov.Tensor]:
if b == TensorBackend.numpy:
return a
if b != TensorBackend.ov:
raise ValueError(f"Unsupported backend: {b}")
return ov.Tensor(a, a.shape, DTYPE_MAP[DTYPE_MAP_REV_NP[a.dtype]])


@numeric.to_backend.register(ov.Tensor)
def _(a: ov.Tensor, b: TensorBackend) -> np.ndarray:
def _(a: ov.Tensor, b: TensorBackend) -> Union[np.ndarray, ov.Tensor]:
if b == TensorBackend.ov:
return a
if b != TensorBackend.numpy:
raise ValueError(f"Unsupported backend: {b}")

Expand Down
