Dynamic shapes WIP
nikita-savelyevv committed Sep 3, 2024
1 parent f059285 commit 1c85732
Showing 3 changed files with 217 additions and 12 deletions.
53 changes: 44 additions & 9 deletions nncf/openvino/quantization/compression_primitives.py
@@ -8,12 +8,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Optional, Tuple

import numpy as np
import openvino as ov
from openvino.runtime import opset13 as opset

from nncf import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig


@@ -30,6 +32,16 @@ def get_compress_weight_primitive(
        zero_point_shape: Optional[Tuple] = None,
        invert_scale: Optional[bool] = False,
    ):
        DYNAMIC_COMPRESSION = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0")))
        if DYNAMIC_COMPRESSION:
            weight_shape = (-1,) * len(weight_shape)
            scale_shape = (-1,) * (len(scale_shape) - 1) + (1,)
            if zero_point_shape is not None:
                zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,)

        recompile = bool(int(os.environ.get("RECOMPILE", "0")))
        if recompile:
            return self._build_compress_model(config, weight_shape, scale_shape, zero_point_shape, invert_scale)
        key = (config.mode, config.num_bits, weight_shape, scale_shape, invert_scale)
        if zero_point_shape is not None:
            key += (zero_point_shape,)
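
Aside (not part of the commit): the (-1,) shapes above work because OpenVINO treats -1 dimensions as dynamic, so a single compiled model can serve every weight shape instead of one cached compiled model per shape. A minimal self-contained sketch of that behavior:

    import numpy as np
    import openvino as ov
    from openvino.runtime import opset13 as opset

    # A parameter whose two dimensions are dynamic: -1 means "any size".
    w = opset.parameter((-1, -1), dtype=np.float32, name="w")
    model = ov.Model([opset.relu(w)], [w])
    compiled = ov.compile_model(model, device_name="CPU")

    # The same compiled model accepts inputs of different shapes.
    print(compiled(np.zeros((2, 3), dtype=np.float32))[0].shape)   # (2, 3)
    print(compiled(np.ones((16, 8), dtype=np.float32))[0].shape)   # (16, 8)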
@@ -46,6 +58,16 @@ def get_compress_decompress_weight_primitive(
        scale_shape: Tuple,
        zero_point_shape: Optional[Tuple] = None,
    ):
        DYNAMIC_COMPRESSION = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0")))
        if DYNAMIC_COMPRESSION:
            weight_shape = (-1,) * len(weight_shape)
            scale_shape = (-1,) * (len(scale_shape) - 1) + (1,)
            if zero_point_shape is not None:
                zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,)

        recompile = bool(int(os.environ.get("RECOMPILE", "0")))
        if recompile:
            return self._build_compress_decompress_model(config, weight_shape, scale_shape, zero_point_shape)
        key = (config.mode, config.num_bits, weight_shape, scale_shape)
        if zero_point_shape is not None:
            key += (zero_point_shape,)
@@ -64,35 +86,48 @@ def _build_compress_model(
        invert_scale: Optional[bool] = False,
        return_nodes: bool = False,
    ):
        FP16_INPUT = bool(int(os.environ.get("FP16_INPUT", "0")))
        INT8_OUTPUT = bool(int(os.environ.get("INT8_OUTPUT", "0")))
        SHARE_OUTPUTS = bool(int(os.environ.get("SHARE_OUTPUTS", "0")))

        w = opset.parameter(weight_shape, name="w", dtype=np.float16 if FP16_INPUT else np.float32)
        s = opset.parameter(scale_shape, name="s")
        parameters = [w, s]

        if FP16_INPUT:
            w = opset.convert(w, ov.Type.f32)

        compressed_w = w * (1 / s) if invert_scale else w / s

        num_bits = config.num_bits
        if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
            dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4
            level_low = 0
            level_high = 2**num_bits - 1

            zp = opset.parameter(zero_point_shape, name="zp")
            parameters.append(zp)
            compressed_w += zp
        elif config.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT4_SYM]:
            dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4
            level_low = -(2 ** (num_bits - 1))
            level_high = 2 ** (num_bits - 1) - 1
        else:
            raise ValueError(f"Unsupported compression mode: {config.mode}")

        result = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights")

        if INT8_OUTPUT:
            result = opset.convert(result, dtype)

        if return_nodes:
            return parameters, result

        model = ov.Model([result], parameters)
        compiled_model = ov.compile_model(model, device_name="CPU")

        return lambda parameters: compiled_model(parameters, share_outputs=SHARE_OUTPUTS)[0]
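
For reference, a small numpy rendition (an illustration, not code from this commit) of what the INT8_ASYM branch of this graph computes (divide by scale, add zero point, round, clamp to [0, 255]):

    import numpy as np

    w = np.array([[0.5, -1.25]], dtype=np.float32)  # weight row
    s = np.array([[0.01]], dtype=np.float32)        # per-row scale
    zp = np.array([[128.0]], dtype=np.float32)      # per-row zero point
    q = np.clip(np.round(w / s + zp), 0, 255).astype(np.uint8)
    print(q)  # [[178   3]]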

    @staticmethod
    def _build_compress_decompress_model(
@@ -9,6 +9,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from dataclasses import dataclass
from typing import Optional, Tuple

@@ -307,13 +308,16 @@ def calculate_quantized_weight(
    if weight.backend == TensorBackend.numpy and not is_openvino_available():
        log_once(logging.INFO, "Compression time may improve after installing OpenVINO")

    NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
    if weight.backend == TensorBackend.numpy and is_openvino_available() and not NUMPY_COMPRESSION:
        from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE

        zero_point_shape = None if zero_point is None else zero_point.shape
        compress_weight_primitive = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive(
            config, weight.shape, scale.shape, zero_point_shape
        )

        assert weight.data.flags["C_CONTIGUOUS"]
        input_tensors = weight.data, scale.data
        if zero_point is not None:
            input_tensors += (zero_point.data,)
@@ -339,7 +343,8 @@
    compressed_weights = fns.clip(compressed_weights, level_low, level_high)

    dtype = TensorDataType.uint8 if asym_quant else TensorDataType.int8
    if compressed_weights.dtype != dtype:
        compressed_weights = compressed_weights.astype(dtype)

    return compressed_weights
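
A note on the C_CONTIGUOUS assertion above (an illustration; the zero-copy rationale is our assumption): OpenVINO can wrap a numpy array without copying only when its memory is C-contiguous, which transposed or strided views are not:

    import numpy as np

    a = np.ones((4, 8), dtype=np.float32)
    print(a.flags["C_CONTIGUOUS"])    # True
    print(a.T.flags["C_CONTIGUOUS"])  # False: would need np.ascontiguousarray(a.T) first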

@@ -405,7 +410,8 @@ def do_int_quantization(
    assert config.is_integer(), "The function supports integer quantization only"
    group_size = config.group_size

    FP16_INPUT = bool(int(os.environ.get("FP16_INPUT", "0")))
    if weight.dtype != TensorDataType.float32 and not FP16_INPUT:
        weight = weight.astype(TensorDataType.float32)

    if group_size != -1:
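The FP16_INPUT path above skips the upcast to float32; a quick numpy sketch (our assumption about the motivation, not code from the commit) of the per-weight memory this avoids:

    import numpy as np

    w16 = np.zeros((4096, 4096), dtype=np.float16)
    print(w16.nbytes / 2**20)                     # 32.0 MiB held in fp16
    print(w16.astype(np.float32).nbytes / 2**20)  # 64.0 MiB for the fp32 copy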
164 changes: 164 additions & 0 deletions weight_compression.py
@@ -0,0 +1,164 @@
# Copyright (c) 2024 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import gc
import os
import shutil
import time
from functools import partial
from pathlib import Path

import openvino as ov

import nncf
from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE
from tools.memory_monitor import MemoryMonitor
from tools.memory_monitor import MemoryType


def parse_arguments():
    parser = argparse.ArgumentParser()

    parser.add_argument("--model-path", type=str, required=True, help="Path where the model is stored")

    parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved")

    parser.add_argument("--numpy-compression", action="store_true", help="Enable numpy compression")

    parser.add_argument("--dynamic-compression", action="store_true", help="Enable dynamic compression")

    parser.add_argument("--fp16-input", action="store_true", help="Enable FP16 input mode")

    parser.add_argument("--int8-output", action="store_true", help="Output in int8")

    parser.add_argument("--recompile", action="store_true", help="Recompile model every time")

    parser.add_argument("--share-outputs", action="store_true", help="Share outputs")

    parser.add_argument("--save-model", action="store_true", help="Save compressed model")

    return parser.parse_args()


def log(mm, fz, log_dir):
    mm.save_memory_logs(
        *mm.get_data(memory_from_zero=fz), save_dir=Path(log_dir), filename_suffix="_from-zero" if fz else ""
    )


def main(args):
    model_path = Path(args.model_path)
    log_dir = Path(args.log_dir)

    numpy_compression = args.numpy_compression
    dynamic_compression = args.dynamic_compression
    fp16_input = args.fp16_input
    int8_output = args.int8_output
    recompile = args.recompile
    share_outputs = args.share_outputs
    save_model = args.save_model

    if numpy_compression:
        log_dir_suffix = "numpy"
    else:
        log_dir_suffix = "ov-dynamic" if dynamic_compression else "ov-static"
        log_dir_suffix = f"{log_dir_suffix}_{('output-int8' if int8_output else 'output-fp32')}"
        log_dir_suffix = f"{log_dir_suffix}_{('input-fp16' if fp16_input else 'input-fp32')}"
        if recompile:
            log_dir_suffix = f"{log_dir_suffix}_recompile"
        if share_outputs:
            log_dir_suffix = f"{log_dir_suffix}_share-outputs"

    memory_monitors = []
    for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]:
        memory_monitor = MemoryMonitor(interval=1e-2, memory_type=memory_type, include_child_processes=False)
        memory_monitor.start(at_exit_fn=partial(log, memory_monitor, mem_from_zero, log_dir / log_dir_suffix))
        memory_monitors.append(memory_monitor)

    core = ov.Core()
    # core.set_property({"ENABLE_MMAP": "NO"})
    model = core.read_model(model_path)

    os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}"
    os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}"
    os.environ["FP16_INPUT"] = f"{int(fp16_input)}"
    os.environ["INT8_OUTPUT"] = f"{int(int8_output)}"
    os.environ["RECOMPILE"] = f"{int(recompile)}"
    os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}"

    start_time = time.perf_counter()
    compressed_model = nncf.compress_weights(model)
    compression_time = time.perf_counter() - start_time
    print(f"Compression Time: {compression_time:.2f} sec.")

    if save_model:
        ov.save_model(compressed_model, log_dir / log_dir_suffix / "openvino_model.xml")
        for filepath in model_path.parent.glob("*.json"):
            shutil.copy(str(filepath), str(log_dir / log_dir_suffix / filepath.name))

    del core
    del model
    del compressed_model
    gc.collect()
    time.sleep(0.5)

    before_cache_deletion = memory_monitors[2].get_data(True)[1][-1]
    if OV_COMPRESSION_PRIMITIVE_CACHE._compress_weight_model_cache:
        OV_COMPRESSION_PRIMITIVE_CACHE._compress_weight_model_cache.clear()
        gc.collect()
        time.sleep(memory_monitors[0].interval * 10)
        after_cache_deletion = memory_monitors[2].get_data(True)[1][-1]
    else:
        after_cache_deletion = before_cache_deletion
    cache_size = before_cache_deletion - after_cache_deletion
    print(f"Cache size: {cache_size:.2f} MiB")

    time.sleep(memory_monitors[0].interval * 10)

    leftover_memory = memory_monitors[2].get_data(True)[1][-1]
    peak_memory = max(memory_monitors[2].get_data(True)[1])
    print(f"Peak memory: {peak_memory:.2f} MiB")
    print(f"Leftover memory: {leftover_memory:.2f} MiB")
    print("Done")

    csv_path = log_dir / "results.csv"
    csv_exists = csv_path.exists()
    csv_path.parent.mkdir(exist_ok=True, parents=True)
    with open(csv_path, "a") as f:
        if not csv_exists:
            f.write(
                "Model Path,"
                "Numpy,"
                "Submodel Type,"
                "Input,"
                "Output,"
                "Compression Time,"
                "Peak Memory,"
                "Cache Size,"
                "Leftover Memory"
                "\n"
            )
        f.write(
            f"{model_path},"
            f"{numpy_compression},"
            f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'},"
            f"{'-' if numpy_compression else 'FP16' if fp16_input else 'FP32'},"
            f"{'-' if numpy_compression else 'INT8' if int8_output else 'FP32'},"
            f"{compression_time:.2f},"
            f"{peak_memory:.2f},"
            f"{cache_size:.2f},"
            f"{leftover_memory:.2f}"
            f"\n"
        )


if __name__ == "__main__":
    args = parse_arguments()
    main(args)
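
An example invocation of this benchmark (hypothetical model path; flags as defined in parse_arguments above):

    python weight_compression.py --model-path ./model_dir/openvino_model.xml \
        --log-dir ./compression_logs --dynamic-compression --fp16-input --share-outputs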
