INT4 performance gains
nikita-savelyevv committed Oct 11, 2024
1 parent 43967ab commit a151d99
Showing 5 changed files with 231 additions and 194 deletions.
nncf/openvino/graph/node_utils.py (2 additions & 1 deletion)
@@ -116,7 +116,8 @@ def get_const_value(const_node: ov.Node) -> np.ndarray:
     :return: The constant value.
     """
     INPUT_DTYPE = os.environ.get("INPUT_DTYPE", "fp32")
-    if const_node.get_element_type() == ov.Type.bf16 and INPUT_DTYPE != "bf16":
+    NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
+    if const_node.get_element_type() == ov.Type.bf16 and (INPUT_DTYPE != "bf16" or NUMPY_COMPRESSION):
         # Fixed FP32 data type as the result for BF16 constant
         return const_node.get_data(dtype=np.float32)
     return const_node.data
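Below is a minimal standalone sketch (mine, not part of the commit) of what the new toggle changes when a BF16 constant is read: with NUMPY_COMPRESSION=1 the value is always extracted as an FP32 numpy copy, even when INPUT_DTYPE=bf16 would otherwise return the raw buffer.

import os

import numpy as np
import openvino as ov
from openvino.runtime import opset13 as opset

# A toy BF16 constant standing in for a model weight.
const_node = opset.constant(np.ones((2, 2), dtype=np.float32), dtype=ov.Type.bf16)

INPUT_DTYPE = os.environ.get("INPUT_DTYPE", "fp32")
NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
if const_node.get_element_type() == ov.Type.bf16 and (INPUT_DTYPE != "bf16" or NUMPY_COMPRESSION):
    value = const_node.get_data(dtype=np.float32)  # upcast FP32 copy
else:
    value = const_node.data  # raw view of the underlying buffer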
nncf/openvino/quantization/compression_primitives.py (15 additions & 4 deletions)
@@ -263,12 +263,14 @@ def _get_compress_model(

         num_bits = config.num_bits
         if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]:
-            dtype = ov.Type.u8
+            # dtype = ov.Type.u8
+            dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4
             level_low = 0
             level_high = 2**num_bits - 1
             compressed_w += opset.convert(zp, ov.Type.f32)
         elif config.mode in [CompressWeightsMode.INT8_SYM, config.mode.INT4_SYM]:
-            dtype = ov.Type.i8
+            # dtype = ov.Type.i8
+            dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4
             level_low = -(2 ** (num_bits - 1))
             level_high = 2 ** (num_bits - 1) - 1
         else:
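For reference, a worked numpy sketch (my illustration, not from the diff) of the asymmetric INT4 grid selected above: num_bits=4 yields integer levels 0..15, and the zero point is folded in before clamping, mirroring the compressed_w += opset.convert(zp, ov.Type.f32) line.

import numpy as np

num_bits = 4
level_low, level_high = 0, 2**num_bits - 1  # 0 and 15

w = np.array([-0.8, -0.1, 0.0, 0.5, 1.2], dtype=np.float32)
scale = (w.max() - w.min()) / (level_high - level_low)
zero_point = np.round(-w.min() / scale)

# Quantize: scale, shift by the zero point, clamp to the 4-bit grid.
compressed_w = np.clip(np.round(w / scale) + zero_point, level_low, level_high)
dequantized = (compressed_w - zero_point) * scale  # within one `scale` step of w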
@@ -282,9 +284,10 @@

         results = [compressed_w]
         if not output_only_weight:
+            s = opset.convert(s, ov.Type.f16)
             results.append(s)
             if zp is not None:
-                results.append(opset.convert(zp, ov.Type.i32))
+                results.append(opset.convert(zp, compressed_w.get_element_type()))
         if return_nodes:
             return parameters, results

@@ -293,7 +296,15 @@
         compiled_model = ov.compile_model(model, device_name="CPU")
 
         SHARE_OUTPUTS = bool(int(os.environ.get("SHARE_OUTPUTS", "0")))
-        return compiled_model, lambda parameters: compiled_model(parameters, share_outputs=SHARE_OUTPUTS)
+
+        def infer(inputs):
+            infer_request = compiled_model.create_infer_request()
+            infer_request.infer(inputs, share_outputs=SHARE_OUTPUTS)
+            outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))]
+            return outputs
+
+        # return compiled_model, lambda parameters: compiled_model(parameters, share_outputs=SHARE_OUTPUTS)
+        return compiled_model, infer
 
     @staticmethod
     def _get_compress_decompress_model(
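From the caller's side, the second returned element is now a function rather than the compiled model itself. A hedged usage sketch (the toy model and names are mine): it runs a dedicated InferRequest and hands back the output ov.Tensor objects directly, so with SHARE_OUTPUTS=1 the outputs can be zero-copy views instead of numpy copies.

import os

import numpy as np
import openvino as ov
from openvino.runtime import opset13 as opset

# Toy stand-in for the compression model: divide weights by scales.
w = opset.parameter([4, 4], dtype=ov.Type.f32, name="w")
s = opset.parameter([4, 1], dtype=ov.Type.f32, name="s")
model = ov.Model([opset.divide(w, s)], [w, s])
compiled_model = ov.compile_model(model, device_name="CPU")

SHARE_OUTPUTS = bool(int(os.environ.get("SHARE_OUTPUTS", "0")))
request = compiled_model.create_infer_request()
request.infer(
    [np.ones((4, 4), dtype=np.float32), np.full((4, 1), 2.0, dtype=np.float32)],
    share_outputs=SHARE_OUTPUTS,
)
output = request.get_output_tensor(0)  # an ov.Tensor, not a numpy copy
print(output.data)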
nncf/quantization/algorithms/weight_compression/openvino_backend.py (28 additions & 11 deletions)
@@ -226,21 +226,38 @@ def _create_compression_subgraph(
         original_shape = weight.shape
         compressed_weight = compress_weight(weight, reduction_axes, compression_config, layer_scales, layer_zero_points)
 
-        compressed_const = opset.constant(compressed_weight.tensor.data, dtype=compression_dtype, name=const_node_name)
+        compressed_weight_data = compressed_weight.tensor.data
+        if isinstance(compressed_weight_data, ov.Tensor):
+            compressed_const = opset.constant(compressed_weight_data, name=const_node_name)
+        else:
+            compressed_const = opset.constant(compressed_weight_data, dtype=compression_dtype, name=const_node_name)
+        if compressed_const.get_element_type() != compression_dtype:
+            compressed_const = opset.convert(compressed_const, compression_dtype)
         converted_const = opset.convert(compressed_const, ov.Type.f16)
-        if compressed_weight.zero_point is not None and compressed_weight.tensor.dtype == TensorDataType.uint8:
-            zero_point_const = opset.constant(
-                compressed_weight.zero_point.data,
-                dtype=compression_dtype,
-                name=f"{const_node_name}/zero_point",
-            )
-            converted_zero_point = opset.convert(zero_point_const, ov.Type.f16)
+        if compressed_weight.zero_point is not None:
+            zero_point_data = compressed_weight.zero_point.data
+            if isinstance(zero_point_data, ov.Tensor):
+                zero_point_const = opset.constant(
+                    compressed_weight.zero_point.data,
+                    name=f"{const_node_name}/zero_point",
+                )
+            else:
+                zero_point_const = opset.constant(
+                    compressed_weight.zero_point.data,
+                    dtype=compression_dtype,
+                    name=f"{const_node_name}/zero_point",
+                )
+            zero_point_const = opset.convert(zero_point_const, ov.Type.f16)
             converted_const = opset.subtract(
-                converted_const, converted_zero_point, name=f"{const_node_name}/zero_point/subtract"
+                converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract"
             )
 
-        scale_const = opset.constant(compressed_weight.scale.data, dtype=scale_dtype, name=f"{const_node_name}/scale")
-        if scale_dtype != ov.Type.f16:
+        scale_data = compressed_weight.scale.data
+        if isinstance(scale_data, ov.Tensor):
+            scale_const = opset.constant(scale_data, name=f"{const_node_name}/scale")
+        else:
+            scale_const = opset.constant(scale_data, dtype=scale_dtype, name=f"{const_node_name}/scale")
+        if scale_const.get_element_type() != ov.Type.f16:
             scale_const = opset.convert(scale_const, ov.Type.f16)
 
         mul = opset.multiply(
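The three constant-creation sites above repeat one pattern. A condensed helper (hypothetical; make_const is my name, not in the commit) would read:

import openvino as ov
from openvino.runtime import opset13 as opset

def make_const(data, target_type: ov.Type, name: str):
    """Create a graph constant from either an ov.Tensor or a numpy array."""
    if isinstance(data, ov.Tensor):
        # An ov.Tensor already carries its element type (e.g. packed u4),
        # so no dtype argument is passed.
        const = opset.constant(data, name=name)
    else:
        const = opset.constant(data, dtype=target_type, name=name)
    if const.get_element_type() != target_type:
        const = opset.convert(const, target_type)
    return const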
@@ -421,7 +421,7 @@ def calculate_quantized_weight(
         compressed_weights = compressed_weights_ov
 
     dtype = TensorDataType.uint8 if asym_quant else TensorDataType.int8
-    if compressed_weights.dtype != dtype:
+    if isinstance(compressed_weights.data, np.ndarray) and compressed_weights.dtype != dtype:
         compressed_weights = compressed_weights.astype(dtype)
     if scale_ov is not None:
         scale, zero_point = scale_ov, zero_point_ov
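A small sketch (my illustration, hypothetical helper name) of the guard introduced here: the cast only applies to numpy-backed weights; an ov.Tensor result, e.g. already-packed u4 data produced by the OpenVINO compression model, has no numpy dtype to cast and passes through unchanged.

import numpy as np

def normalize_weight_dtype(data, asym_quant: bool):
    """Cast numpy-backed weights to the canonical storage dtype;
    leave ov.Tensor-backed (e.g. packed u4) results untouched."""
    target = np.uint8 if asym_quant else np.int8
    if isinstance(data, np.ndarray) and data.dtype != target:
        data = data.astype(target)
    return data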
