INT4 experiments

openvinotoolkit · Sep 26, 2024 · c9569bb
1 parent 43967ab
commit c9569bb
Show file tree

Hide file tree

Showing 5 changed files with 338 additions and 306 deletions.
diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py
@@ -116,7 +116,8 @@ def get_const_value(const_node: ov.Node) -> np.ndarray:
     :return: The constant value.
     """
     INPUT_DTYPE = os.environ.get("INPUT_DTYPE", "fp32")
-    if const_node.get_element_type() == ov.Type.bf16 and INPUT_DTYPE != "bf16":
+    NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
+    if const_node.get_element_type() == ov.Type.bf16 and (INPUT_DTYPE != "bf16" or NUMPY_COMPRESSION):
         # Fixed FP32 data type as the result for BF16 constant
         return const_node.get_data(dtype=np.float32)
     return const_node.data

diff --git a/nncf/openvino/quantization/compression_primitives.py b/nncf/openvino/quantization/compression_primitives.py
@@ -264,11 +264,13 @@ def _get_compress_model(
         num_bits = config.num_bits
         if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]:
             dtype = ov.Type.u8
+            # dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4
             level_low = 0
             level_high = 2**num_bits - 1
             compressed_w += opset.convert(zp, ov.Type.f32)
         elif config.mode in [CompressWeightsMode.INT8_SYM, config.mode.INT4_SYM]:
             dtype = ov.Type.i8
+            # dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.u4
             level_low = -(2 ** (num_bits - 1))
             level_high = 2 ** (num_bits - 1) - 1
         else:

diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
@@ -8,6 +8,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 from typing import Dict, Iterable, List, Optional, Tuple
 
 import openvino as ov
@@ -226,6 +227,15 @@ def _create_compression_subgraph(
         original_shape = weight.shape
         compressed_weight = compress_weight(weight, reduction_axes, compression_config, layer_scales, layer_zero_points)
 
+        # NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
+        # FP32_OUTPUT = bool(int(os.environ.get("FP32_OUTPUT", "0")))
+        # if compression_config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM] and not NUMPY_COMPRESSION and not FP32_OUTPUT:
+        #     flat_compressed_weight = ov.Tensor(compressed_weight.tensor.data, (compressed_weight.tensor.shape[0]*2, ), compression_dtype)
+        #     compressed_const = opset.constant(flat_compressed_weight)
+        #     compressed_shape = compressed_weight.scale.shape[:2] + (compression_config.group_size,)
+        #     compressed_const = opset.reshape(compressed_const, compressed_shape)
+        # else:
+        #     compressed_const = opset.constant(compressed_weight.tensor.data, dtype=compression_dtype, name=const_node_name)
         compressed_const = opset.constant(compressed_weight.tensor.data, dtype=compression_dtype, name=const_node_name)
         converted_const = opset.convert(compressed_const, ov.Type.f16)
         if compressed_weight.zero_point is not None and compressed_weight.tensor.dtype == TensorDataType.uint8:

diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -350,15 +350,27 @@ def calculate_quantized_weight(
     ov_compression = weight.backend in [TensorBackend.numpy, TensorBackend.ov] and is_openvino_available() and not NUMPY_COMPRESSION
     compressed_weights_ov, scale_ov, zero_point_ov = None, None, None
     if ov_compression:
+        import openvino as ov
         from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE
 
+        # if INPUT_DTYPE == "bf16":
+        #     assert weight.dtype == TensorDataType.float16
+        #     input_dtype = ov.Type.bf16
+        # elif weight.dtype == TensorDataType.float16:
+        #     input_dtype = ov.Type.f16
+        # elif weight.dtype == TensorDataType.float32:
+        #     input_dtype = ov.Type.f32
+        # else:
+        #     raise Exception
+        # input_tensors = (ov.Tensor(weight.data, weight.data.shape, input_dtype),)
+
         if INPUT_DTYPE == "bf16":
-            import openvino as ov
             assert weight.data.dtype == np.float16
             weight_data = ov.Tensor(weight.data, weight.data.shape, ov.Type.bf16)
         else:
             weight_data = weight.data
         input_tensors = (weight_data,)
+
         if not END_TO_END_COMPRESSION:
             zero_point_shape = None if zero_point is None else zero_point.shape
             compiled_model, compress_weight_primitive = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive(