diff --git a/examples/post_training_quantization/openvino/yolov8/main.py b/examples/post_training_quantization/openvino/yolov8/main.py
index 6c0aca78631..e69136db796 100644
--- a/examples/post_training_quantization/openvino/yolov8/main.py
+++ b/examples/post_training_quantization/openvino/yolov8/main.py
@@ -8,15 +8,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import os
+
+os.environ["TORCHINDUCTOR_FREEZING"] = "1"
+
 import re
 import subprocess
+import time
+from copy import deepcopy
 from pathlib import Path
 from typing import Any, Dict, Tuple
 
 import numpy as np
 import openvino as ov
+import openvino.torch  # noqa
 import torch
 from torch._export import capture_pre_autograd_graph
+from torch.fx.passes.graph_drawer import FxGraphDrawer
 from tqdm import tqdm
 from ultralytics.cfg import get_cfg
 from ultralytics.data.converter import coco80_to_coco91_class
@@ -32,6 +41,36 @@
 ROOT = Path(__file__).parent.resolve()
 
 
+def measure_time(model, example_inputs, num_iters=500):
+    with torch.no_grad():
+        model(*example_inputs)
+        total_time = 0
+        for i in range(0, num_iters):
+            start_time = time.time()
+            model(*example_inputs)
+            total_time += time.time() - start_time
+        average_time = (total_time / num_iters) * 1000
+    return average_time
+
+
+def validate_fx(
+    model: ov.Model, data_loader: torch.utils.data.DataLoader, validator: Validator, num_samples: int = None
+) -> Tuple[Dict, int, int]:
+    validator.seen = 0
+    validator.jdict = []
+    validator.stats = []
+    validator.confusion_matrix = ConfusionMatrix(nc=validator.nc)
+    for batch_i, batch in enumerate(data_loader):
+        if num_samples is not None and batch_i == num_samples:
+            break
+        batch = validator.preprocess(batch)
+        preds = model(batch["img"])
+        preds = validator.postprocess(preds)
+        validator.update_metrics(preds, batch)
+    stats = validator.get_stats()
+    return stats, validator.seen, validator.nt_per_class.sum()
+
+
 def validate(
     model: ov.Model, data_loader: torch.utils.data.DataLoader, validator: Validator, num_samples: int = None
 ) -> Tuple[Dict, int, int]:
@@ -139,6 +178,66 @@ def transform_fn(data_item: Dict):
     return quantized_model
 
 
+NNCF_QUANTIZATION = True
+
+
+def quantize_impl(exported_model, val_loader, validator):
+    def transform_fn(x):
+        batch = validator.preprocess(x)
+        return batch["img"]
+
+    calibration_dataset = nncf.Dataset(val_loader, transform_fn)
+    dir_name = str(Path(__file__).parent)
+    if NNCF_QUANTIZATION:
+        converted_model = nncf.quantize(
+            exported_model,
+            calibration_dataset,
+            ignored_scope=nncf.IgnoredScope(
+                types=["mul", "sub", "sigmoid"],
+                subgraphs=[
+                    nncf.Subgraph(
+                        inputs=["cat_13", "cat_14", "cat_15"],
+                        outputs=["output"],
+                    )
+                ],
+            ),
+        )
+        g = FxGraphDrawer(converted_model, "yolo_nncf_fx_int8")
+        g.get_dot_graph().write_svg(dir_name + "/yolo_nncf_fx_int8.svg")
+
+        quantized_model = torch.compile(converted_model, backend="openvino")
+        return quantized_model
+    else:
+        from torch.ao.quantization.quantize_pt2e import convert_pt2e
+        from torch.ao.quantization.quantize_pt2e import prepare_pt2e
+        from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer
+        from torch.ao.quantization.quantizer.x86_inductor_quantizer import get_default_x86_inductor_quantization_config
+
+        quantizer = X86InductorQuantizer()
+        quantizer.set_global(get_default_x86_inductor_quantization_config())
+
+        prepared_model = prepare_pt2e(exported_model, quantizer)
+
+        for idx, batch in tqdm(enumerate(calibration_dataset.get_inference_data())):
+            if idx >= 300:
+                break
+            prepared_model(batch)
+
+        converted_model = convert_pt2e(prepared_model)
+
+        g = FxGraphDrawer(prepared_model, "yolo_torch_fx_int8")
+        g.get_dot_graph().write_svg(dir_name + "/yolo_torch_fx_int8.svg")
+        import torch._inductor.config as config
+
+        config.cpp_wrapper = True
+
+        quantized_model = torch.compile(converted_model)
+        return quantized_model
+
+
+TORCH_FX = True
+
+
 def main():
     MODEL_NAME = "yolov8n"
 
@@ -150,13 +249,39 @@ def main():
     validator, data_loader = prepare_validation(model, args)
 
     # Convert to OpenVINO model
+    if TORCH_FX:
+        batch = next(iter(data_loader))
+        batch = validator.preprocess(batch)
+
+        with torch.no_grad():
+            # fp_stats, total_images, total_object = validate(model.model, tqdm(data_loader), validator)
+            # print("Floating-point model validation results:")
+            # print_statistics(fp_stats, total_images, total_objects)
+            model.model.eval()
+            model.model(batch["img"])
+            exported_model = capture_pre_autograd_graph(model.model, args=(batch["img"],))
+            quantized_model = quantize_impl(deepcopy(exported_model), data_loader, validator)
+
+        fp32_compiled_model = torch.compile(exported_model, backend="openvino")
+        fp32_stats, total_images, total_objects = validate_fx(fp32_compiled_model, tqdm(data_loader), validator)
+        # fp32_stats, total_images, total_objects = validate_fx(model.model, tqdm(data_loader), validator)
+        print("FP32 model validation results:")
+        print_statistics(fp32_stats, total_images, total_objects)
+
+        int8_stats, total_images, total_objects = validate_fx(quantized_model, tqdm(data_loader), validator)
+        print("INT8 model validation results:")
+        print_statistics(int8_stats, total_images, total_objects)
+
+        print("Start fp32 model benchmarking...")
+        fp32_latency = measure_time(fp32_compiled_model, (batch["img"],))
+        print(f"fp32 latency: {fp32_latency}")
+
+        print("Start int8 model benchmarking...")
+        int8_latency = measure_time(quantized_model, (batch["img"],))
+        print(f"int8 latency: {int8_latency}")
+        print(f"Speed up: {fp32_latency / int8_latency}")
+        return
-    example_inputs = torch.ones((1, 3, 640, 640))
-    # model.model = torch.compile(model.model)
-    # fx_model = model.export(format="torchscript")
-    with torch.no_grad():
-        model.model.eval()
-        capture_pre_autograd_graph(model.model, (example_inputs,))
 
     ov_model, ov_model_path = prepare_openvino_model(model, MODEL_NAME)
 
     # Quantize mode in OpenVINO representation
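Taken together, the example's TorchFX path is: capture the model with `capture_pre_autograd_graph`, quantize the captured graph with `nncf.quantize`, and run it through `torch.compile` with the OpenVINO backend. A minimal sketch of that flow on a toy model (illustrative only: `ToyModel` and the calibration inputs are placeholders, and it assumes a PyTorch/NNCF/OpenVINO install where the experimental FX backend and the `openvino` `torch.compile` backend are available):

```python
import openvino.torch  # noqa: F401  # registers the "openvino" torch.compile backend
import torch
from torch._export import capture_pre_autograd_graph

import nncf


class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, kernel_size=3)

    def forward(self, x):
        return torch.nn.functional.silu(self.conv(x))


model = ToyModel().eval()
example_input = torch.ones(1, 3, 64, 64)

with torch.no_grad():
    # Capture an FX graph ahead of autograd, as main.py does for YOLOv8.
    exported_model = capture_pre_autograd_graph(model, args=(example_input,))

# Calibrate on a few inputs and insert quantizers into the captured graph.
calibration_dataset = nncf.Dataset([example_input] * 10)
quantized_model = nncf.quantize(exported_model, calibration_dataset)

# Compile the quantized FX graph with the OpenVINO backend and run it.
compiled_model = torch.compile(quantized_model, backend="openvino")
with torch.no_grad():
    _ = compiled_model(example_input)
```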
diff --git a/nncf/common/hardware/configs/cpu.json b/nncf/common/hardware/configs/cpu.json
index 4b39be807d9..0cd5290ae6d 100644
--- a/nncf/common/hardware/configs/cpu.json
+++ b/nncf/common/hardware/configs/cpu.json
@@ -64,6 +64,13 @@
                 "weights": ["q8_w_sym", "q8_w_asym"]
             }
         },
+        {
+            "type": "Add",
+            "quantization": {
+                "activations": "q8_a",
+                "weights": ["q8_w_sym", "q8_w_asym"]
+            }
+        },
         {
             "type": "Multiply",
             "quantization": {
diff --git a/nncf/experimental/torch_fx/nncf_graph_builder.py b/nncf/experimental/torch_fx/nncf_graph_builder.py
index 65bde1ffa26..c60b7533744 100644
--- a/nncf/experimental/torch_fx/nncf_graph_builder.py
+++ b/nncf/experimental/torch_fx/nncf_graph_builder.py
@@ -58,7 +58,6 @@ def _get_node_type_and_metatype(node: torch.fx.Node) -> Tuple[str, om.OperatorMe
             node_metatype = om.PTConstNoopMetatype
         elif node.op in ("call_function",):
             if hasattr(node.target, "overloadpacket"):
-                torch.nn.BatchNorm2d
                 node_type = str(node.target.overloadpacket).split(".")[1]
             elif node.target.__name__ == "getitem":
                 node_type = "__getitem__"
@@ -66,6 +65,8 @@ def _get_node_type_and_metatype(node: torch.fx.Node) -> Tuple[str, om.OperatorMe
                 # TODO: get correct nodes types from this nodes as well
                 node_type = str(node.target)
         node_metatype = PT_OPERATOR_METATYPES.get_operator_metatype_by_op_name(node_type)
+        # if node_metatype is UnknownMetatype:
+        #     breakpoint()
         # TODO: add layer attrs and support subtypes
         # if node_metatype.get_subtypes():
         #     subtype = node_metatype.determine_subtype(
@@ -208,10 +209,10 @@ def get_module_params_or_buffers():
         for source_node in model.graph.nodes:
             source_nncf_node = nncf_graph.get_node_by_name(source_node.name)
-            for dist_node in source_node.users:
+            for idx, dist_node in enumerate(source_node.users):
                 dist_node_id = nncf_graph.get_node_by_name(dist_node.name).node_id
                 input_port_id, output_port_id, tensor_shape = GraphConverter.get_edge_params(
-                    model, source_node, source_nncf_node, dist_node
+                    model, source_node, source_nncf_node, dist_node, idx
                 )
 
                 nncf_graph.add_edge_between_nncf_nodes(
@@ -226,14 +227,19 @@ def get_module_params_or_buffers():
 
         return nncf_graph
 
     @staticmethod
-    def get_edge_params(model, source_node: torch.fx.Node, source_nncf_node: NNCFNode, dist_node: torch.fx.Node):
-        # TODO: support cat
+    def get_edge_params(
+        model, source_node: torch.fx.Node, source_nncf_node: NNCFNode, dist_node: torch.fx.Node, output_idx: int
+    ):
        output_port_id = 0
         if source_node.op in ("get_attr",):
             tensor_shape = tuple(getattr(model, source_node.target).shape)
         elif "val" in source_node.meta:
             if source_nncf_node.metatype is om.PTBatchNormMetatype:
                 tensor = source_node.meta["val"][0]
+            elif source_nncf_node.metatype is om.PTSplitMetatype:
+                tensor = source_node.meta["val"][output_idx]
+                # Assume every split output corresponds to a unique output_port_id
+                output_port_id = output_idx
             else:
                 tensor = source_node.meta["val"]
             tensor_shape = tuple(tensor.shape)
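The `nncf_graph_builder.py` change threads the consumer index into `get_edge_params` so that each output of a split node gets its own shape and output port; the patch assumes the enumeration order of `source_node.users` matches the split's output order. A small sketch of the FX metadata this relies on (the model and names below are illustrative, not from the patch): after capture, a `split_with_sizes` node carries one fake tensor per output in `node.meta["val"]`, so a single tensor shape cannot describe all outgoing edges.

```python
import torch
from torch._export import capture_pre_autograd_graph


class SplitModel(torch.nn.Module):
    def forward(self, x):
        # torch.split with explicit sizes lowers to aten.split_with_sizes.
        first, second = torch.split(x, [2, 4], dim=1)
        return first + 1, second * 2


gm = capture_pre_autograd_graph(SplitModel(), args=(torch.ones(1, 6, 8, 8),))

for node in gm.graph.nodes:
    if "split" in node.name:
        # meta["val"] holds one fake tensor per split output; the converter indexes
        # it with the consumer's position and reuses that index as the output port.
        print(node.name, [tuple(value.shape) for value in node.meta["val"]])
```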
diff --git a/nncf/quantization/algorithms/min_max/torch_fx_backend.py b/nncf/quantization/algorithms/min_max/torch_fx_backend.py
index 74b91f59d63..9b739e0e2c2 100644
--- a/nncf/quantization/algorithms/min_max/torch_fx_backend.py
+++ b/nncf/quantization/algorithms/min_max/torch_fx_backend.py
@@ -43,6 +43,7 @@
 from nncf.torch.graph.transformations.commands import PTSharedFnInsertionCommand
 from nncf.torch.hardware.config import PTHWConfig
 from nncf.torch.nncf_network import NNCFNetwork
+from nncf.torch.quantization.default_quantization import DEFAULT_PT_QUANT_TRAIT_TO_OP_DICT
 from nncf.torch.quantization.layers import QUANTIZATION_MODULES
 from nncf.torch.quantization.layers import AsymmetricQuantizer
 from nncf.torch.quantization.layers import BaseQuantizer
@@ -118,6 +119,7 @@ def hw_config(self) -> HWConfig:
 
     @property
     def quant_trait_op_dict(self) -> Dict[int, OperatorMetatype]:
+        return DEFAULT_PT_QUANT_TRAIT_TO_OP_DICT
         return DEFAULT_FX_QUANT_TRAIT_TO_OP_DICT
 
     @staticmethod
@@ -320,8 +322,11 @@ def create_unified_scales_quantizers_insertion_commands(
         )
 
         # transformation = fake_quantize_insertion_tranformation_builder(quantizer, target_points)
-        transformation = qdq_insertion_tranformation_builder(quantizer, target_points)
-        return [FXApplyTransformationCommand(transformation)]
+        transformations = []
+        for tp in target_points:
+            transformation = qdq_insertion_tranformation_builder(quantizer, [tp])
+            transformations.append(FXApplyTransformationCommand(transformation))
+        return transformations
 
     @staticmethod
     def get_ignored_metatypes(model_type: ModelType, device: TargetDevice) -> List[OperatorMetatype]:
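With this change `quant_trait_op_dict` returns `DEFAULT_PT_QUANT_TRAIT_TO_OP_DICT`, short-circuiting the FX-specific dictionary. A quick way to inspect what that switch covers, assuming the NNCF sources are importable (inspection only, nothing here is part of the patch):

```python
from nncf.torch.quantization.default_quantization import DEFAULT_PT_QUANT_TRAIT_TO_OP_DICT

# Print which operator metatypes each quantization trait maps to in the PT defaults.
for trait, metatypes in DEFAULT_PT_QUANT_TRAIT_TO_OP_DICT.items():
    print(trait, [metatype.name for metatype in metatypes])
```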
diff --git a/nncf/torch/graph/operator_metatypes.py b/nncf/torch/graph/operator_metatypes.py
index c980658260d..a5e7f77c243 100644
--- a/nncf/torch/graph/operator_metatypes.py
+++ b/nncf/torch/graph/operator_metatypes.py
@@ -528,7 +528,7 @@ class PTGELUMetatype(PTOperatorMetatype):
 @PT_OPERATOR_METATYPES.register()
 class PTSILUMetatype(PTOperatorMetatype):
     name = "SiluOp"
-    module_to_function_names = {NamespaceTarget.TORCH_NN_FUNCTIONAL: ["silu"]}
+    module_to_function_names = {NamespaceTarget.TORCH_NN_FUNCTIONAL: ["silu"], NamespaceTarget.ATEN: ["silu_"]}
 
 
 @PT_OPERATOR_METATYPES.register()
@@ -871,6 +871,7 @@ class PTSplitMetatype(PTOperatorMetatype):
         NamespaceTarget.TORCH_NN_FUNCTIONAL: [],
         NamespaceTarget.TORCH_TENSOR: ["split", "chunk", "unbind"],
         NamespaceTarget.TORCH: ["split", "chunk", "unbind"],
+        NamespaceTarget.ATEN: ["split_with_sizes"],
     }
     hw_config_names = [HWConfigOpName.SPLIT, HWConfigOpName.CHUNK]
 
@@ -1036,7 +1037,10 @@ class PTSqrtMetatype(PTOperatorMetatype):
 
 @PT_OPERATOR_METATYPES.register()
 class PTInterpolateMetatype(PTOperatorMetatype):
     name = "InterpolateOp"
-    module_to_function_names = {NamespaceTarget.TORCH_NN_FUNCTIONAL: ["interpolate"]}
+    module_to_function_names = {
+        NamespaceTarget.TORCH_NN_FUNCTIONAL: ["interpolate"],
+        NamespaceTarget.ATEN: ["upsample_nearest2d", "upsample_nearest_exact2d"],
+    }
     hw_config_names = [HWConfigOpName.INTERPOLATE]
     num_expected_input_edges = 1
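These metatype updates register the aten-level names (`silu_`, `split_with_sizes`, `upsample_nearest2d`, `upsample_nearest_exact2d`) that appear in captured FX graphs. A small sanity check, assuming the patched NNCF is importable, using the same lookup the FX graph builder calls:

```python
from nncf.torch.graph.operator_metatypes import PT_OPERATOR_METATYPES

# Each name should now resolve to a concrete metatype instead of the unknown one.
for op_name in ("silu_", "split_with_sizes", "upsample_nearest2d", "upsample_nearest_exact2d"):
    metatype = PT_OPERATOR_METATYPES.get_operator_metatype_by_op_name(op_name)
    print(op_name, "->", metatype.name)
```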