From 3856b9904e360c9e8a2239dd6f0aa294a6a2c3d9 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Mon, 29 Jan 2024 15:18:38 +0100
Subject: [PATCH] Import InferRequestWrapper from optimum-intel instead of
 re-defining it

---
 .../267-distil-whisper-asr.ipynb | 60 ++++---------
 1 file changed, 12 insertions(+), 48 deletions(-)

diff --git a/notebooks/267-distil-whisper-asr/267-distil-whisper-asr.ipynb b/notebooks/267-distil-whisper-asr/267-distil-whisper-asr.ipynb
index 51761226ea1..a253e3c1997 100644
--- a/notebooks/267-distil-whisper-asr/267-distil-whisper-asr.ipynb
+++ b/notebooks/267-distil-whisper-asr/267-distil-whisper-asr.ipynb
@@ -924,7 +924,7 @@
"### Prepare calibration datasets\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
- "First step is to prepare calibration datasets for quantization. Since we quantize whisper encoder and decoder separately, we need to prepare a calibration dataset for each of the models. We define a `InferRequestWrapper` class that will intercept model inputs and collect them to a list. Then we run model inference on some small amount of audio samples. Generally, increasing the calibration dataset size improves quantization quality."
+ "The first step is to prepare calibration datasets for quantization. Since we quantize the Whisper encoder and decoder separately, we need to prepare a calibration dataset for each of the models. We import an `InferRequestWrapper` class from optimum-intel that intercepts model inputs and collects them into a list. Then we run model inference on a small number of audio samples. Generally, increasing the calibration dataset size improves quantization quality."
]
},
{
@@ -946,44 +946,10 @@
"%%skip not $to_quantize.value\n",
"\n",
"from itertools import islice\n",
- "from typing import List, Any\n",
- "from openvino import Tensor\n",
+ "from optimum.intel.openvino.quantization import InferRequestWrapper\n",
"\n",
"\n",
- "class InferRequestWrapper:\n",
- "    def __init__(self, request, data_cache: List):\n",
- "        self.request = request\n",
- "        self.data_cache = data_cache\n",
- "\n",
- "    def __call__(self, *args, **kwargs):\n",
- "        self.data_cache.append(*args)\n",
- "        return self.request(*args, **kwargs)\n",
- "\n",
- "    def infer(self, inputs: Any = None, shared_memory: bool = False):\n",
- "        self.data_cache.append(inputs)\n",
- "        return self.request.infer(inputs, shared_memory)\n",
- "\n",
- "    def start_async(\n",
- "        self,\n",
- "        inputs: Any = None,\n",
- "        userdata: Any = None,\n",
- "        share_inputs: bool = False,\n",
- "    ):\n",
- "        self.data_cache.append(inputs)\n",
- "        self.request.infer(inputs, share_inputs)\n",
- "\n",
- "    def wait(self):\n",
- "        pass\n",
- "\n",
- "    def get_tensor(self, name: str):\n",
- "        return Tensor(self.request.results[name])\n",
- "\n",
- "    def __getattr__(self, attr):\n",
- "        if attr in self.__dict__:\n",
- "            return getattr(self, attr)\n",
- "        return getattr(self.request, attr)\n",
- "\n",
- "def collect_calibration_dataset(ov_model, calibration_dataset_size):\n",
+ "def collect_calibration_dataset(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):\n",
"    # Overwrite model request properties, saving the original ones for restoring later\n",
"    original_encoder_request = ov_model.encoder.request\n",
"    original_decoder_with_past_request = ov_model.decoder_with_past.request\n",
@@ -1124,11 +1090,11 @@
"import nncf\n",
"\n",
"CALIBRATION_DATASET_SIZE = 50\n",
- "quantized_distil_model_path = Path(f\"{model_path}_quantized\")\n",
+ "quantized_model_path = Path(f\"{model_path}_quantized\")\n",
"\n",
"\n",
"def quantize(ov_model, calibration_dataset_size):\n", - " if not quantized_distil_model_path.exists():\n", + "def quantize(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):\n", + " if not quantized_model_path.exists():\n", " encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(\n", " ov_model, calibration_dataset_size\n", " )\n", @@ -1136,13 +1102,12 @@ " quantized_encoder = nncf.quantize(\n", " ov_model.encoder.model,\n", " nncf.Dataset(encoder_calibration_data),\n", - " preset=nncf.QuantizationPreset.MIXED,\n", " subset_size=len(encoder_calibration_data),\n", " model_type=nncf.ModelType.TRANSFORMER,\n", " # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n", " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.50)\n", " )\n", - " ov.save_model(quantized_encoder, quantized_distil_model_path / \"openvino_encoder_model.xml\")\n", + " ov.save_model(quantized_encoder, quantized_model_path / \"openvino_encoder_model.xml\")\n", " del quantized_encoder\n", " del encoder_calibration_data\n", " gc.collect()\n", @@ -1151,23 +1116,22 @@ " quantized_decoder_with_past = nncf.quantize(\n", " ov_model.decoder_with_past.model,\n", " nncf.Dataset(decoder_calibration_data),\n", - " preset=nncf.QuantizationPreset.MIXED,\n", " subset_size=len(decoder_calibration_data),\n", " model_type=nncf.ModelType.TRANSFORMER,\n", " # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n", " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.95)\n", " )\n", - " ov.save_model(quantized_decoder_with_past, quantized_distil_model_path / \"openvino_decoder_with_past_model.xml\")\n", + " ov.save_model(quantized_decoder_with_past, quantized_model_path / \"openvino_decoder_with_past_model.xml\")\n", " del quantized_decoder_with_past\n", " del decoder_calibration_data\n", " gc.collect()\n", "\n", " # Copy the config file and the first-step-decoder manually\n", - " shutil.copy(model_path / \"config.json\", quantized_distil_model_path / \"config.json\")\n", - " shutil.copy(model_path / \"openvino_decoder_model.xml\", quantized_distil_model_path / \"openvino_decoder_model.xml\")\n", - " shutil.copy(model_path / \"openvino_decoder_model.bin\", quantized_distil_model_path / \"openvino_decoder_model.bin\")\n", + " shutil.copy(model_path / \"config.json\", quantized_model_path / \"config.json\")\n", + " shutil.copy(model_path / \"openvino_decoder_model.xml\", quantized_model_path / \"openvino_decoder_model.xml\")\n", + " shutil.copy(model_path / \"openvino_decoder_model.bin\", quantized_model_path / \"openvino_decoder_model.bin\")\n", "\n", - " quantized_ov_model = OVModelForSpeechSeq2Seq.from_pretrained(quantized_distil_model_path, ov_config=ov_config, compile=False)\n", + " quantized_ov_model = OVModelForSpeechSeq2Seq.from_pretrained(quantized_model_path, ov_config=ov_config, compile=False)\n", " quantized_ov_model.to(device.value)\n", " quantized_ov_model.compile()\n", " return quantized_ov_model\n",