diff --git a/nncf/openvino/quantization/compression_primitives.py b/nncf/openvino/quantization/compression_primitives.py
index b13f73defd0..29284be2b7e 100644
--- a/nncf/openvino/quantization/compression_primitives.py
+++ b/nncf/openvino/quantization/compression_primitives.py
@@ -8,12 +8,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import os
 from typing import Optional, Tuple
 
+import numpy as np
 import openvino as ov
 from openvino.runtime import opset13 as opset
 
+from nncf import CompressWeightsMode
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
 
 
@@ -30,6 +32,16 @@ def get_compress_weight_primitive(
         zero_point_shape: Optional[Tuple] = None,
         invert_scale: Optional[bool] = False,
     ):
+        DYNAMIC_COMPRESSION = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0")))
+        if DYNAMIC_COMPRESSION:
+            weight_shape = (-1,) * len(weight_shape)
+            scale_shape = (-1,) * (len(scale_shape) - 1) + (1,)
+            if zero_point_shape is not None:
+                zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,)
+
+        recompile = bool(int(os.environ.get("RECOMPILE", "0")))
+        if recompile:
+            return self._build_compress_model(config, weight_shape, scale_shape, zero_point_shape, invert_scale)
         key = (config.mode, config.num_bits, weight_shape, scale_shape, invert_scale)
         if zero_point_shape is not None:
             key += (zero_point_shape,)
@@ -46,6 +58,16 @@ def get_compress_decompress_weight_primitive(
         scale_shape: Tuple,
         zero_point_shape: Optional[Tuple] = None,
     ):
+        DYNAMIC_COMPRESSION = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0")))
+        if DYNAMIC_COMPRESSION:
+            weight_shape = (-1,) * len(weight_shape)
+            scale_shape = (-1,) * (len(scale_shape) - 1) + (1,)
+            if zero_point_shape is not None:
+                zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,)
+
+        recompile = bool(int(os.environ.get("RECOMPILE", "0")))
+        if recompile:
+            return self._build_compress_decompress_model(config, weight_shape, scale_shape, zero_point_shape)
         key = (config.mode, config.num_bits, weight_shape, scale_shape)
         if zero_point_shape is not None:
             key += (zero_point_shape,)
@@ -64,27 +86,40 @@ def _build_compress_model(
         invert_scale: Optional[bool] = False,
         return_nodes: bool = False,
     ):
-        w = opset.parameter(weight_shape, name="w")
+        FP16_INPUT = bool(int(os.environ.get("FP16_INPUT", "0")))
+        INT8_OUTPUT = bool(int(os.environ.get("INT8_OUTPUT", "0")))
+        SHARE_OUTPUTS = bool(int(os.environ.get("SHARE_OUTPUTS", "0")))
+
+        w = opset.parameter(weight_shape, name="w", dtype=np.float16 if FP16_INPUT else np.float32)
         s = opset.parameter(scale_shape, name="s")
         parameters = [w, s]
-        if invert_scale:
-            compressed_w = w * (1 / s)
-        else:
-            compressed_w = w / s
+
+        if FP16_INPUT:
+            w = opset.convert(w, ov.Type.f32)
+
+        compressed_w = w * (1 / s) if invert_scale else w / s
+
         num_bits = config.num_bits
-        if zero_point_shape is not None:
+        if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
+            dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4
             level_low = 0
             level_high = 2**num_bits - 1
             zp = opset.parameter(zero_point_shape, name="zp")
             parameters.append(zp)
             compressed_w += zp
-        else:
+        elif config.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT4_SYM]:
+            dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4
             level_low = -(2 ** (num_bits - 1))
             level_high = 2 ** (num_bits - 1) - 1
+        else:
+            raise Exception(f"Unsupported weight compression mode: {config.mode}")
 
         result = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights")
+        if INT8_OUTPUT:
+            result = opset.convert(result, dtype)
+
         if return_nodes:
             return parameters, result
@@ -92,7 +127,7 @@ def _build_compress_model(
 
         compiled_model = ov.compile_model(model, device_name="CPU")
 
-        return lambda parameters: compiled_model(parameters)[0]
+        return lambda parameters: compiled_model(parameters, share_outputs=SHARE_OUTPUTS)[0]
 
     @staticmethod
     def _build_compress_decompress_model(
diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
index b7659c732de..83b5845103d 100644
--- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py
+++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -9,6 +9,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import os
 from dataclasses import dataclass
 from typing import Optional, Tuple
 
@@ -307,13 +308,16 @@ def calculate_quantized_weight(
     if weight.backend == TensorBackend.numpy and not is_openvino_available():
         log_once(logging.INFO, "Compression time may improve after installing OpenVINO")
 
-    if weight.backend == TensorBackend.numpy and is_openvino_available():
+    NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
+    if weight.backend == TensorBackend.numpy and is_openvino_available() and not NUMPY_COMPRESSION:
         from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE
 
         zero_point_shape = None if zero_point is None else zero_point.shape
         compress_weight_primitive = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive(
             config, weight.shape, scale.shape, zero_point_shape
         )
+
+        assert weight.data.flags["C_CONTIGUOUS"]
         input_tensors = weight.data, scale.data
         if zero_point is not None:
             input_tensors += (zero_point.data,)
@@ -339,7 +343,8 @@ def calculate_quantized_weight(
     compressed_weights = fns.clip(compressed_weights, level_low, level_high)
 
     dtype = TensorDataType.uint8 if asym_quant else TensorDataType.int8
-    compressed_weights = compressed_weights.astype(dtype)
+    if compressed_weights.dtype != dtype:
+        compressed_weights = compressed_weights.astype(dtype)
 
     return compressed_weights
 
@@ -405,7 +410,8 @@ def do_int_quantization(
     assert config.is_integer(), "The function supports integer quantization only"
     group_size = config.group_size
 
-    if weight.dtype != TensorDataType.float32:
+    FP16_INPUT = bool(int(os.environ.get("FP16_INPUT", "0")))
+    if weight.dtype != TensorDataType.float32 and not FP16_INPUT:
         weight = weight.astype(TensorDataType.float32)
 
     if group_size != -1:
diff --git a/weight_compression.py b/weight_compression.py
new file mode 100644
index 00000000000..6b68c6c0f4a
--- /dev/null
+++ b/weight_compression.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2024 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import gc
+import os
+import shutil
+import time
+from functools import partial
+from pathlib import Path
+
+import openvino as ov
+
+import nncf
+from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE
+from tools.memory_monitor import MemoryMonitor
+from tools.memory_monitor import MemoryType
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--model-path", type=str, required=True, help="Path where the model is stored")
+
+    parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved")
+
+    parser.add_argument("--numpy-compression", action="store_true", help="Enable numpy compression")
+
+    parser.add_argument("--dynamic-compression", action="store_true", help="Enable dynamic compression")
+
+    parser.add_argument("--fp16-input", action="store_true", help="Enable FP16 input mode")
+
+    parser.add_argument("--int8-output", action="store_true", help="Output in int8")
+
+    parser.add_argument("--recompile", action="store_true", help="Recompile model every time")
+
+    parser.add_argument("--share-outputs", action="store_true", help="Share outputs")
+
+    parser.add_argument("--save-model", action="store_true", help="Save compressed model")
+
+    return parser.parse_args()
+
+
+def log(mm, fz, log_dir):
+    mm.save_memory_logs(
+        *mm.get_data(memory_from_zero=fz), save_dir=Path(log_dir), filename_suffix="_from-zero" if fz else ""
+    )
+
+
+def main(args):
+    model_path = Path(args.model_path)
+    log_dir = Path(args.log_dir)
+
+    numpy_compression = args.numpy_compression
+    dynamic_compression = args.dynamic_compression
+    fp16_input = args.fp16_input
+    int8_output = args.int8_output
+    recompile = args.recompile
+    share_outputs = args.share_outputs
+    save_model = args.save_model
+
+    # Build a log sub-directory name that encodes the chosen benchmark mode
+    if numpy_compression:
+        log_dir_suffix = "numpy"
+    else:
+        log_dir_suffix = "ov-dynamic" if dynamic_compression else "ov-static"
+        log_dir_suffix = f"{log_dir_suffix}_{('output-int8' if int8_output else 'output-fp32')}"
+        log_dir_suffix = f"{log_dir_suffix}_{('input-fp16' if fp16_input else 'input-fp32')}"
+        if recompile:
+            log_dir_suffix = f"{log_dir_suffix}_recompile"
+        if share_outputs:
+            log_dir_suffix = f"{log_dir_suffix}_share-outputs"
+
+    memory_monitors = []
+    for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]:
+        memory_monitor = MemoryMonitor(interval=1e-2, memory_type=memory_type, include_child_processes=False)
+        memory_monitor.start(at_exit_fn=partial(log, memory_monitor, mem_from_zero, log_dir / log_dir_suffix))
+        memory_monitors.append(memory_monitor)
+
+    core = ov.Core()
+    # core.set_property({"ENABLE_MMAP": "NO"})
+    model = core.read_model(model_path)
+
+    # Forward the benchmark flags to NNCF internals via environment variables
+    os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}"
+    os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}"
+    os.environ["FP16_INPUT"] = f"{int(fp16_input)}"
+    os.environ["INT8_OUTPUT"] = f"{int(int8_output)}"
+    os.environ["RECOMPILE"] = f"{int(recompile)}"
+    os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}"
+
+    start_time = time.perf_counter()
+    compressed_model = nncf.compress_weights(model)
+    compression_time = time.perf_counter() - start_time
+    print(f"Compression Time: {compression_time:.2f} sec.")
+
+    if save_model:
+        ov.save_model(compressed_model, log_dir / log_dir_suffix / "openvino_model.xml")
+        for filepath in model_path.parent.glob("*.json"):
+            shutil.copy(str(filepath), str(log_dir / log_dir_suffix / filepath.name))
+
+    # Release references so that only NNCF-internal caches keep memory alive
+    del core
+    del model
+    del compressed_model
+    gc.collect()
+    time.sleep(0.5)
+
+    # Estimate the compiled-model cache size by clearing it and measuring
+    # how much system memory is released
+    before_cache_deletion = memory_monitors[2].get_data(True)[1][-1]
+    if OV_COMPRESSION_PRIMITIVE_CACHE._compress_weight_model_cache:
+        OV_COMPRESSION_PRIMITIVE_CACHE._compress_weight_model_cache.clear()
+        gc.collect()
+        time.sleep(memory_monitors[0].interval * 10)
+        after_cache_deletion = memory_monitors[2].get_data(True)[1][-1]
+    else:
+        after_cache_deletion = before_cache_deletion
+    cache_size = before_cache_deletion - after_cache_deletion
+    print(f"Cache size: {cache_size:.2f} MiB")
+
+    time.sleep(memory_monitors[0].interval * 10)
+
+    leftover_memory = memory_monitors[2].get_data(True)[1][-1]
+    peak_memory = max(memory_monitors[2].get_data(True)[1])
+    print(f"Peak memory: {peak_memory:.2f} MiB")
+    print(f"Leftover memory: {leftover_memory:.2f} MiB")
+    print("Done")
+
+    csv_path = log_dir / "results.csv"
+    csv_exists = csv_path.exists()
+    csv_path.parent.mkdir(exist_ok=True, parents=True)
+    with open(csv_path, "a") as f:
+        if not csv_exists:
+            f.write(
+                "Model Path,"
+                "Numpy,"
+                "Submodel Type,"
+                "Input,"
+                "Output,"
+                "Compression Time,"
+                "Peak Memory,"
+                "Cache Size,"
+                "Leftover Memory"
+                "\n"
+            )
+        f.write(
+            f"{model_path},"
+            f"{numpy_compression},"
+            f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'},"
+            f"{'-' if numpy_compression else 'FP16' if fp16_input else 'FP32'},"
+            f"{'-' if numpy_compression else 'INT8' if int8_output else 'FP32'},"
+            f"{compression_time:.2f},"
+            f"{peak_memory:.2f},"
+            f"{cache_size:.2f},"
+            f"{leftover_memory:.2f}"
+            f"\n"
+        )
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    main(args)
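
Reviewer note: as a quick cross-check of the subgraph built in _build_compress_model, the NumPy sketch below computes the same quantization. The helper name compress_weight_numpy is illustrative only and not part of this patch; the asymmetric branch corresponds to the u8/u4 path above, the symmetric branch to i8/i4.

    import numpy as np

    def compress_weight_numpy(w, scale, zero_point=None, num_bits=8):
        # Same steps as the OV graph: divide by scale, shift by the zero
        # point in asymmetric mode, then round and clamp to the level range.
        compressed = w / scale
        if zero_point is not None:
            compressed += zero_point
            level_low, level_high = 0, 2**num_bits - 1
        else:
            level_low = -(2 ** (num_bits - 1))
            level_high = 2 ** (num_bits - 1) - 1
        return np.clip(np.round(compressed), level_low, level_high)

A typical run of the new benchmark script, assuming an OpenVINO IR on disk (the path is a placeholder), would be:

    python weight_compression.py --model-path /path/to/openvino_model.xml --dynamic-compression --int8-output

Each flag maps one-to-one onto the environment variables consumed in compression_primitives.py and weight_lowering.py.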