Dynamic shapes WIP
nikita-savelyevv committed Sep 3, 2024
1 parent f059285 commit 1c85732
Showing 3 changed files with 217 additions and 12 deletions.
53 changes: 44 additions & 9 deletions nncf/openvino/quantization/compression_primitives.py
@@ -8,12 +8,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Optional, Tuple

import numpy as np
import openvino as ov
from openvino.runtime import opset13 as opset

from nncf import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig


@@ -30,6 +32,16 @@ def get_compress_weight_primitive(
        zero_point_shape: Optional[Tuple] = None,
        invert_scale: Optional[bool] = False,
    ):
        DYNAMIC_COMPRESSION = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0")))
        if DYNAMIC_COMPRESSION:
            weight_shape = (-1,) * len(weight_shape)
            scale_shape = (-1,) * (len(scale_shape) - 1) + (1,)
            if zero_point_shape is not None:
                zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,)

        recompile = bool(int(os.environ.get("RECOMPILE", "0")))
        if recompile:
            return self._build_compress_model(config, weight_shape, scale_shape, zero_point_shape, invert_scale)
        key = (config.mode, config.num_bits, weight_shape, scale_shape, invert_scale)
        if zero_point_shape is not None:
            key += (zero_point_shape,)
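
Aside (not part of the commit): the (-1,) shapes above work because OpenVINO treats -1 dimensions as dynamic, so a single compiled model can serve every weight shape instead of one cached compiled model per shape. A minimal self-contained sketch of that behavior:

    import numpy as np
    import openvino as ov
    from openvino.runtime import opset13 as opset

    # A parameter whose two dimensions are dynamic: -1 means "any size".
    w = opset.parameter((-1, -1), dtype=np.float32, name="w")
    model = ov.Model([opset.relu(w)], [w])
    compiled = ov.compile_model(model, device_name="CPU")

    # The same compiled model accepts inputs of different shapes.
    print(compiled(np.zeros((2, 3), dtype=np.float32))[0].shape)   # (2, 3)
    print(compiled(np.ones((16, 8), dtype=np.float32))[0].shape)   # (16, 8)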
@@ -46,6 +58,16 @@ def get_compress_decompress_weight_primitive(
        scale_shape: Tuple,
        zero_point_shape: Optional[Tuple] = None,
    ):
        DYNAMIC_COMPRESSION = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0")))
        if DYNAMIC_COMPRESSION:
            weight_shape = (-1,) * len(weight_shape)
            scale_shape = (-1,) * (len(scale_shape) - 1) + (1,)
            if zero_point_shape is not None:
                zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,)

        recompile = bool(int(os.environ.get("RECOMPILE", "0")))
        if recompile:
            return self._build_compress_decompress_model(config, weight_shape, scale_shape, zero_point_shape)
        key = (config.mode, config.num_bits, weight_shape, scale_shape)
        if zero_point_shape is not None:
            key += (zero_point_shape,)
@@ -64,35 +86,48 @@ def _build_compress_model(
        invert_scale: Optional[bool] = False,
        return_nodes: bool = False,
    ):
        FP16_INPUT = bool(int(os.environ.get("FP16_INPUT", "0")))
        INT8_OUTPUT = bool(int(os.environ.get("INT8_OUTPUT", "0")))
        SHARE_OUTPUTS = bool(int(os.environ.get("SHARE_OUTPUTS", "0")))

        w = opset.parameter(weight_shape, name="w", dtype=np.float16 if FP16_INPUT else np.float32)
        s = opset.parameter(scale_shape, name="s")
        parameters = [w, s]

        if FP16_INPUT:
            w = opset.convert(w, ov.Type.f32)

        compressed_w = w * (1 / s) if invert_scale else w / s

        num_bits = config.num_bits
        if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
            dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4
            level_low = 0
            level_high = 2**num_bits - 1

            zp = opset.parameter(zero_point_shape, name="zp")
            parameters.append(zp)
            compressed_w += zp
        elif config.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT4_SYM]:
            dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4
            level_low = -(2 ** (num_bits - 1))
            level_high = 2 ** (num_bits - 1) - 1
        else:
            raise ValueError(f"Unsupported compression mode: {config.mode}")

        result = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights")

        if INT8_OUTPUT:
            result = opset.convert(result, dtype)

        if return_nodes:
            return parameters, result

        model = ov.Model([result], parameters)
        compiled_model = ov.compile_model(model, device_name="CPU")

        return lambda parameters: compiled_model(parameters, share_outputs=SHARE_OUTPUTS)[0]
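
For reference, a small numpy rendition (an illustration, not code from this commit) of what the INT8_ASYM branch of this graph computes (divide by scale, add zero point, round, clamp to [0, 255]):

    import numpy as np

    w = np.array([[0.5, -1.25]], dtype=np.float32)  # weight row
    s = np.array([[0.01]], dtype=np.float32)        # per-row scale
    zp = np.array([[128.0]], dtype=np.float32)      # per-row zero point
    q = np.clip(np.round(w / s + zp), 0, 255).astype(np.uint8)
    print(q)  # [[178   3]]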

    @staticmethod
    def _build_compress_decompress_model(
@@ -9,6 +9,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from dataclasses import dataclass
from typing import Optional, Tuple

@@ -307,13 +308,16 @@ def calculate_quantized_weight(
    if weight.backend == TensorBackend.numpy and not is_openvino_available():
        log_once(logging.INFO, "Compression time may improve after installing OpenVINO")

    NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
    if weight.backend == TensorBackend.numpy and is_openvino_available() and not NUMPY_COMPRESSION:
        from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE

        zero_point_shape = None if zero_point is None else zero_point.shape
        compress_weight_primitive = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive(
            config, weight.shape, scale.shape, zero_point_shape
        )

        assert weight.data.flags["C_CONTIGUOUS"]
        input_tensors = weight.data, scale.data
        if zero_point is not None:
            input_tensors += (zero_point.data,)
@@ -339,7 +343,8 @@
    compressed_weights = fns.clip(compressed_weights, level_low, level_high)

    dtype = TensorDataType.uint8 if asym_quant else TensorDataType.int8
    if compressed_weights.dtype != dtype:
        compressed_weights = compressed_weights.astype(dtype)

    return compressed_weights
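
A note on the C_CONTIGUOUS assertion above (an illustration; the zero-copy rationale is our assumption): OpenVINO can wrap a numpy array without copying only when its memory is C-contiguous, which transposed or strided views are not:

    import numpy as np

    a = np.ones((4, 8), dtype=np.float32)
    print(a.flags["C_CONTIGUOUS"])    # True
    print(a.T.flags["C_CONTIGUOUS"])  # False: would need np.ascontiguousarray(a.T) first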

@@ -405,7 +410,8 @@ def do_int_quantization(
    assert config.is_integer(), "The function supports integer quantization only"
    group_size = config.group_size

    FP16_INPUT = bool(int(os.environ.get("FP16_INPUT", "0")))
    if weight.dtype != TensorDataType.float32 and not FP16_INPUT:
        weight = weight.astype(TensorDataType.float32)

    if group_size != -1:
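The FP16_INPUT path above skips the upcast to float32; a quick numpy sketch (our assumption about the motivation, not code from the commit) of the per-weight memory this avoids:

    import numpy as np

    w16 = np.zeros((4096, 4096), dtype=np.float16)
    print(w16.nbytes / 2**20)                     # 32.0 MiB held in fp16
    print(w16.astype(np.float32).nbytes / 2**20)  # 64.0 MiB for the fp32 copy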
164 changes: 164 additions & 0 deletions weight_compression.py
@@ -0,0 +1,164 @@
# Copyright (c) 2024 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import gc
import os
import shutil
import time
from functools import partial
from pathlib import Path

import openvino as ov

import nncf
from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE
from tools.memory_monitor import MemoryMonitor
from tools.memory_monitor import MemoryType


def parse_arguments():
    parser = argparse.ArgumentParser()

    parser.add_argument("--model-path", type=str, required=True, help="Path where the model is stored")

    parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved")

    parser.add_argument("--numpy-compression", action="store_true", help="Enable numpy compression")

    parser.add_argument("--dynamic-compression", action="store_true", help="Enable dynamic compression")

    parser.add_argument("--fp16-input", action="store_true", help="Enable FP16 input mode")

    parser.add_argument("--int8-output", action="store_true", help="Output in int8")

    parser.add_argument("--recompile", action="store_true", help="Recompile model every time")

    parser.add_argument("--share-outputs", action="store_true", help="Share outputs")

    parser.add_argument("--save-model", action="store_true", help="Save compressed model")

    return parser.parse_args()


def log(mm, fz, log_dir):
    mm.save_memory_logs(
        *mm.get_data(memory_from_zero=fz), save_dir=Path(log_dir), filename_suffix="_from-zero" if fz else ""
    )


def main(args):
    model_path = Path(args.model_path)
    log_dir = Path(args.log_dir)

    numpy_compression = args.numpy_compression
    dynamic_compression = args.dynamic_compression
    fp16_input = args.fp16_input
    int8_output = args.int8_output
    recompile = args.recompile
    share_outputs = args.share_outputs
    save_model = args.save_model

    if numpy_compression:
        log_dir_suffix = "numpy"
    else:
        log_dir_suffix = "ov-dynamic" if dynamic_compression else "ov-static"
        log_dir_suffix = f"{log_dir_suffix}_{('output-int8' if int8_output else 'output-fp32')}"
        log_dir_suffix = f"{log_dir_suffix}_{('input-fp16' if fp16_input else 'input-fp32')}"
        if recompile:
            log_dir_suffix = f"{log_dir_suffix}_recompile"
        if share_outputs:
            log_dir_suffix = f"{log_dir_suffix}_share-outputs"

    memory_monitors = []
    for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]:
        memory_monitor = MemoryMonitor(interval=1e-2, memory_type=memory_type, include_child_processes=False)
        memory_monitor.start(at_exit_fn=partial(log, memory_monitor, mem_from_zero, log_dir / log_dir_suffix))
        memory_monitors.append(memory_monitor)

    core = ov.Core()
    # core.set_property({"ENABLE_MMAP": "NO"})
    model = core.read_model(model_path)

    os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}"
    os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}"
    os.environ["FP16_INPUT"] = f"{int(fp16_input)}"
    os.environ["INT8_OUTPUT"] = f"{int(int8_output)}"
    os.environ["RECOMPILE"] = f"{int(recompile)}"
    os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}"

    start_time = time.perf_counter()
    compressed_model = nncf.compress_weights(model)
    compression_time = time.perf_counter() - start_time
    print(f"Compression Time: {compression_time:.2f} sec.")

    if save_model:
        ov.save_model(compressed_model, log_dir / log_dir_suffix / "openvino_model.xml")
        for filepath in model_path.parent.glob("*.json"):
            shutil.copy(str(filepath), str(log_dir / log_dir_suffix / filepath.name))

    del core
    del model
    del compressed_model
    gc.collect()
    time.sleep(0.5)

    before_cache_deletion = memory_monitors[2].get_data(True)[1][-1]
    if OV_COMPRESSION_PRIMITIVE_CACHE._compress_weight_model_cache:
        OV_COMPRESSION_PRIMITIVE_CACHE._compress_weight_model_cache.clear()
        gc.collect()
        time.sleep(memory_monitors[0].interval * 10)
        after_cache_deletion = memory_monitors[2].get_data(True)[1][-1]
    else:
        after_cache_deletion = before_cache_deletion
    cache_size = before_cache_deletion - after_cache_deletion
    print(f"Cache size: {cache_size:.2f} MiB")

    time.sleep(memory_monitors[0].interval * 10)

    leftover_memory = memory_monitors[2].get_data(True)[1][-1]
    peak_memory = max(memory_monitors[2].get_data(True)[1])
    print(f"Peak memory: {peak_memory:.2f} MiB")
    print(f"Leftover memory: {leftover_memory:.2f} MiB")
    print("Done")

    csv_path = log_dir / "results.csv"
    csv_exists = csv_path.exists()
    csv_path.parent.mkdir(exist_ok=True, parents=True)
    with open(csv_path, "a") as f:
        if not csv_exists:
            f.write(
                "Model Path,"
                "Numpy,"
                "Submodel Type,"
                "Input,"
                "Output,"
                "Compression Time,"
                "Peak Memory,"
                "Cache Size,"
                "Leftover Memory"
                "\n"
            )
        f.write(
            f"{model_path},"
            f"{numpy_compression},"
            f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'},"
            f"{'-' if numpy_compression else 'FP16' if fp16_input else 'FP32'},"
            f"{'-' if numpy_compression else 'INT8' if int8_output else 'FP32'},"
            f"{compression_time:.2f},"
            f"{peak_memory:.2f},"
            f"{cache_size:.2f},"
            f"{leftover_memory:.2f}"
            f"\n"
        )


if __name__ == "__main__":
    args = parse_arguments()
    main(args)
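
An example invocation of this benchmark (hypothetical model path; flags as defined in parse_arguments above):

    python weight_compression.py --model-path ./model_dir/openvino_model.xml \
        --log-dir ./compression_logs --dynamic-compression --fp16-input --share-outputs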
