
[Torch FX] Post Quantize Weights Compression #2984

Merged
74 commits
f9e5d7c
Update torch_fx_backend.py
anzr299 Aug 20, 2024
5b11455
Merge branch 'openvinotoolkit:develop' into develop
anzr299 Aug 26, 2024
0eff5cb
Merge branch 'openvinotoolkit:develop' into develop
anzr299 Aug 30, 2024
c7b9093
Merge branch 'openvinotoolkit:develop' into develop
anzr299 Aug 30, 2024
e7097bd
Merge branch 'openvinotoolkit:develop' into develop
anzr299 Aug 30, 2024
2665666
Merge branch 'openvinotoolkit:develop' into develop
anzr299 Sep 2, 2024
1b4a926
Merge branch 'openvinotoolkit:develop' into develop
anzr299 Sep 10, 2024
74d8f4c
Merge branch 'openvinotoolkit:develop' into develop
anzr299 Sep 12, 2024
415a222
Merge branch 'openvinotoolkit:develop' into develop
anzr299 Sep 18, 2024
75978ac
post quantize compression transformation init
anzr299 Sep 19, 2024
f231f17
Merge branch 'openvinotoolkit:develop' into fx_post_quantize_compress…
anzr299 Sep 19, 2024
b49d9f7
fix per tensor transformation
anzr299 Sep 20, 2024
094802d
add test for post quantization compression transformation
anzr299 Sep 20, 2024
9c49e77
Merge branch 'openvinotoolkit:develop' into fx_post_quantize_compress…
anzr299 Sep 23, 2024
b4719a8
remove buffer test
anzr299 Sep 23, 2024
939a560
Merge branch 'openvinotoolkit:develop' into develop
anzr299 Sep 24, 2024
7b05343
Merge branch 'openvinotoolkit:develop' into fx_post_quantize_compress…
anzr299 Sep 24, 2024
a71e892
update reference graphs
anzr299 Sep 24, 2024
21826ef
Merge branch 'fx_post_quantize_compression_transformation' of https:/…
anzr299 Sep 24, 2024
5990008
fix tests
anzr299 Sep 24, 2024
eba3ea8
Merge branch 'openvinotoolkit:develop' into fx_post_quantize_compress…
anzr299 Sep 24, 2024
1e2c8b5
remove redundant code
anzr299 Sep 24, 2024
28ca749
Merge branch 'fx_post_quantize_compression_transformation' of https:/…
anzr299 Sep 24, 2024
cc544ff
Merge branch 'openvinotoolkit:develop' into develop
anzr299 Sep 24, 2024
445466f
disable transformation test for sanity
anzr299 Sep 25, 2024
fff4b1f
test ci value
anzr299 Sep 25, 2024
9a359ab
Merge branch 'openvinotoolkit:develop' into develop
anzr299 Sep 26, 2024
efb490c
post quantize compression transformation init
anzr299 Sep 19, 2024
3c8d5fc
fix per tensor transformation
anzr299 Sep 20, 2024
82ad7c5
add test for post quantization compression transformation
anzr299 Sep 20, 2024
1618119
remove buffer test
anzr299 Sep 23, 2024
0b29461
update reference graphs
anzr299 Sep 24, 2024
e4a3386
fix tests
anzr299 Sep 24, 2024
9fd0746
remove redundant code
anzr299 Sep 24, 2024
b7b9388
disable transformation test for sanity
anzr299 Sep 25, 2024
0852843
test ci value
anzr299 Sep 25, 2024
811b2a8
Merge branch 'fx_post_quantize_compression_transformation' of https:/…
anzr299 Sep 27, 2024
b059c4c
Merge branch 'develop' into fx_post_quantize_compression_transformation
anzr299 Sep 30, 2024
e5917fa
add transformation for FQ of weights
anzr299 Oct 7, 2024
7a80e5c
update graph tests
anzr299 Oct 7, 2024
f0bb1ec
Add backend parameters,
anzr299 Oct 10, 2024
f6bcdf8
pre-commit fix
anzr299 Oct 10, 2024
2b0ef25
Merge branch 'develop' into fx_post_quantize_compression_transformation
anzr299 Oct 10, 2024
e1046e9
update reference graphs
anzr299 Oct 10, 2024
c032e1c
update meta of new node
anzr299 Oct 10, 2024
e605c97
1. update meta copy transformation
anzr299 Oct 13, 2024
609c3f5
update gold for sanity test
anzr299 Oct 13, 2024
5f52ff0
refactor imports
anzr299 Oct 13, 2024
7d2d901
1. Add checks for qdq nodes pattern
anzr299 Oct 14, 2024
6a42d5f
Test for QDQ nodes with add node in between to simulate non NNCF QDQ …
anzr299 Oct 14, 2024
90353a3
extend the check for ignoring nodes with qdq already to linear node
anzr299 Oct 14, 2024
464d280
Include zero point for asymmetric quantization
anzr299 Oct 14, 2024
4206768
update reference graphs and make changes as suggested
anzr299 Oct 14, 2024
853fc2a
move test to test_models.py
anzr299 Oct 15, 2024
bd1636d
pre-commit fix
anzr299 Oct 15, 2024
c5f1bff
Merge branch 'openvinotoolkit:develop' into fx_post_quantize_compress…
anzr299 Oct 15, 2024
d35392a
fix test
anzr299 Oct 15, 2024
237fd00
1. update constant function docstring
anzr299 Oct 15, 2024
3857668
add comment about model buffer update line in quantize_model
anzr299 Oct 15, 2024
d3ba573
Merge branch 'openvinotoolkit:develop' into fx_post_quantize_compress…
anzr299 Oct 15, 2024
caa4213
adding space between descriptions and params
anzr299 Oct 16, 2024
00884f8
Merge branch 'fx_post_quantize_compression_transformation' of https:/…
anzr299 Oct 16, 2024
90342f4
Add spaces, Remove extra code
anzr299 Oct 16, 2024
bc568ba
pre-commit fix, comment refactoring
anzr299 Oct 16, 2024
30b7773
Merge branch 'develop' into fx_post_quantize_compression_transformation
anzr299 Oct 16, 2024
31fc7b3
Fix single node being passed to _set_new_node_meta
anzr299 Oct 16, 2024
f6a7a34
Update tests/torch/fx/test_models.py
anzr299 Oct 16, 2024
55a5716
Merge branch 'develop' into fx_post_quantize_compression_transformation
anzr299 Oct 17, 2024
9caa478
pre-commit fix
anzr299 Oct 17, 2024
59512ce
Merge branch 'openvinotoolkit:develop' into fx_post_quantize_compress…
anzr299 Oct 18, 2024
e2e0f85
change transformation to update weight, zp, scale when replacing the …
anzr299 Oct 18, 2024
91bc576
update return of qdq constant transformation function
anzr299 Oct 18, 2024
feac3b8
Minor Fixes
anzr299 Oct 18, 2024
9f13704
Minor Fix #2
anzr299 Oct 18, 2024
31 changes: 31 additions & 0 deletions nncf/experimental/torch/fx/quantization/backend_parameters.py
@@ -0,0 +1,31 @@
# Copyright (c) 2024 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters


class FXBackendParameters:
COMPRESS_WEIGHTS = "compress_weights"


def is_weight_compression_needed(advanced_parameters: Optional[AdvancedQuantizationParameters]) -> bool:
"""
Determines whether weight compression is needed based on the provided
advanced quantization parameters.

:param advanced_parameters: Advanced quantization parameters.
:return: True if weight compression is needed, False otherwise.
"""
if advanced_parameters is not None and advanced_parameters.backend_params is not None:
return advanced_parameters.backend_params.get(FXBackendParameters.COMPRESS_WEIGHTS, True)
return True
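The lookup above defaults to compression unless the caller explicitly opts out. A minimal standalone sketch of that logic (a plain dict stands in for `AdvancedQuantizationParameters.backend_params`, and the `"compress_weights"` key mirrors `FXBackendParameters.COMPRESS_WEIGHTS`):

```python
from typing import Optional

COMPRESS_WEIGHTS = "compress_weights"  # mirrors FXBackendParameters.COMPRESS_WEIGHTS


def is_weight_compression_needed(backend_params: Optional[dict]) -> bool:
    # Compression defaults to True; it is skipped only when the caller
    # explicitly passes {"compress_weights": False}.
    if backend_params is not None:
        return backend_params.get(COMPRESS_WEIGHTS, True)
    return True


print(is_weight_compression_needed(None))                         # True: no params given
print(is_weight_compression_needed({}))                           # True: key absent
print(is_weight_compression_needed({COMPRESS_WEIGHTS: False}))    # False: explicit opt-out
```

In the real API the dict would come from `AdvancedQuantizationParameters(backend_params={"compress_weights": False})` passed to quantization; the sketch only isolates the default-True behavior.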
13 changes: 13 additions & 0 deletions nncf/experimental/torch/fx/quantization/quantize_model.py
@@ -26,7 +26,10 @@
from nncf.common.logging import nncf_logger
from nncf.common.quantization.structs import QuantizationPreset
from nncf.data import Dataset
from nncf.experimental.torch.fx.quantization.backend_parameters import is_weight_compression_needed
from nncf.experimental.torch.fx.transformations import apply_quantization_transformations
from nncf.experimental.torch.fx.transformations import compress_post_quantize_transformation
from nncf.experimental.torch.fx.transformations import fq_weights_transformation
from nncf.experimental.torch.fx.transformations import revert_quantization_transformations
from nncf.experimental.torch.fx.transformations import shared_constants_unification_transformation
from nncf.parameters import BackupMode
@@ -94,6 +97,11 @@ def quantize_impl(
# bias configuration.
revert_quantization_transformations(quantized_model)

if is_weight_compression_needed(advanced_parameters):
compress_post_quantize_transformation(quantized_model)
else:
fq_weights_transformation(quantized_model)

# Magic. Without this call compiled model
    # is not performant
quantized_model = GraphModule(quantized_model, quantized_model.graph)
@@ -107,6 +115,11 @@ def quantize_impl(

quantized_model.meta.update(original_graph_meta)
quantized_model = _disallow_eval_train(quantized_model)
    # This is to ensure that the model buffers are updated.
    # It was observed that every transformation added another
    # value to the buffer; this step removes the duplicate
    # buffer values.
quantized_model = GraphModule(quantized_model, quantized_model.graph)

return quantized_model
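The post-quantization branch added to `quantize_impl` can be sketched standalone. The stub transformations below are illustrative stand-ins for the real `compress_post_quantize_transformation` and `fq_weights_transformation` (which operate on a `torch.fx.GraphModule`); only the dispatch logic matches the diff:

```python
from typing import Optional


def compress_post_quantize_transformation(model: dict) -> None:
    # Stand-in: fold weight quantize/dequantize pairs into compressed int8 constants.
    model["weights"] = "int8-compressed"


def fq_weights_transformation(model: dict) -> None:
    # Stand-in: apply fake-quantize to the weights, keeping them in fp32.
    model["weights"] = "fp32-fake-quantized"


def apply_post_quantize_step(model: dict, advanced_parameters: Optional[dict]) -> dict:
    # Mirrors the diff: compress by default, unless backend_params opts out.
    compress = True
    if advanced_parameters is not None:
        compress = advanced_parameters.get("compress_weights", True)
    if compress:
        compress_post_quantize_transformation(model)
    else:
        fq_weights_transformation(model)
    return model


print(apply_post_quantize_step({}, None))
print(apply_post_quantize_step({}, {"compress_weights": False}))
```

The design choice the diff encodes: even when compression is disabled, the weight fake-quantize subgraph is still rewritten (`fq_weights_transformation`), so both branches leave the graph in a consistent post-quantization form.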
