Import InferRequestWrapper from optimum-intel instead of re-defining it
nikita-savelyevv committed Jan 29, 2024
1 parent 36fd474 commit 3856b99
Showing 1 changed file with 12 additions and 48 deletions.
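The gist of the change, as a minimal sketch: the wrapper class now comes from optimum-intel rather than being re-defined inside the notebook (the import path is taken from the diff below).

# Before: the notebook carried its own ~30-line copy of the wrapper class.
# After: reuse the implementation shipped with optimum-intel.
from optimum.intel.openvino.quantization import InferRequestWrapper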
60 changes: 12 additions & 48 deletions notebooks/267-distil-whisper-asr/267-distil-whisper-asr.ipynb
@@ -924,7 +924,7 @@
"### Prepare calibration datasets\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"First step is to prepare calibration datasets for quantization. Since we quantize whisper encoder and decoder separately, we need to prepare a calibration dataset for each of the models. We define a `InferRequestWrapper` class that will intercept model inputs and collect them to a list. Then we run model inference on some small amount of audio samples. Generally, increasing the calibration dataset size improves quantization quality."
"First step is to prepare calibration datasets for quantization. Since we quantize whisper encoder and decoder separately, we need to prepare a calibration dataset for each of the models. We import an `InferRequestWrapper` class that will intercept model inputs and collect them to a list. Then we run model inference on some small amount of audio samples. Generally, increasing the calibration dataset size improves quantization quality."
]
},
{
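For reference, the interception pattern described in the cell above boils down to the following minimal sketch. It assumes an already-loaded ov_model (OVModelForSpeechSeq2Seq) and a calibration_dataset iterable of preprocessed samples; the generate call and the sample layout are illustrative assumptions, while the wrapper's two-argument construction matches how the notebook uses it.

from itertools import islice

from optimum.intel.openvino.quantization import InferRequestWrapper

encoder_calibration_data = []
original_request = ov_model.encoder.request
# The wrapper forwards every call to the original request and records the inputs
ov_model.encoder.request = InferRequestWrapper(original_request, encoder_calibration_data)
try:
    for sample in islice(calibration_dataset, 10):  # a small number of audio samples
        ov_model.generate(sample["input_features"])  # illustrative; actual preprocessing may differ
finally:
    ov_model.encoder.request = original_request  # always restore the original request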
@@ -946,44 +946,10 @@
"%%skip not $to_quantize.value\n",
"\n",
"from itertools import islice\n",
"from typing import List, Any\n",
"from openvino import Tensor\n",
"from optimum.intel.openvino.quantization import InferRequestWrapper\n",
"\n",
"\n",
"class InferRequestWrapper:\n",
" def __init__(self, request, data_cache: List):\n",
" self.request = request\n",
" self.data_cache = data_cache\n",
"\n",
" def __call__(self, *args, **kwargs):\n",
" self.data_cache.append(*args)\n",
" return self.request(*args, **kwargs)\n",
"\n",
" def infer(self, inputs: Any = None, shared_memory: bool = False):\n",
" self.data_cache.append(inputs)\n",
" return self.request.infer(inputs, shared_memory)\n",
"\n",
" def start_async(\n",
" self,\n",
" inputs: Any = None,\n",
" userdata: Any = None,\n",
" share_inputs: bool = False,\n",
" ):\n",
" self.data_cache.append(inputs)\n",
" self.request.infer(inputs, share_inputs)\n",
"\n",
" def wait(self):\n",
" pass\n",
"\n",
" def get_tensor(self, name: str):\n",
" return Tensor(self.request.results[name])\n",
"\n",
" def __getattr__(self, attr):\n",
" if attr in self.__dict__:\n",
" return getattr(self, attr)\n",
" return getattr(self.request, attr)\n",
"\n",
"def collect_calibration_dataset(ov_model, calibration_dataset_size):\n",
"def collect_calibration_dataset(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):\n",
" # Overwrite model request properties, saving the original ones for restoring later\n",
" original_encoder_request = ov_model.encoder.request\n",
" original_decoder_with_past_request = ov_model.decoder_with_past.request\n",
@@ -1124,25 +1090,24 @@
"import nncf\n",
"\n",
"CALIBRATION_DATASET_SIZE = 50\n",
"quantized_distil_model_path = Path(f\"{model_path}_quantized\")\n",
"quantized_model_path = Path(f\"{model_path}_quantized\")\n",
"\n",
"\n",
"def quantize(ov_model, calibration_dataset_size):\n",
" if not quantized_distil_model_path.exists():\n",
"def quantize(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):\n",
" if not quantized_model_path.exists():\n",
" encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(\n",
" ov_model, calibration_dataset_size\n",
" )\n",
" print(\"Quantizing encoder\")\n",
" quantized_encoder = nncf.quantize(\n",
" ov_model.encoder.model,\n",
" nncf.Dataset(encoder_calibration_data),\n",
" preset=nncf.QuantizationPreset.MIXED,\n",
" subset_size=len(encoder_calibration_data),\n",
" model_type=nncf.ModelType.TRANSFORMER,\n",
" # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n",
" advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.50)\n",
" )\n",
" ov.save_model(quantized_encoder, quantized_distil_model_path / \"openvino_encoder_model.xml\")\n",
" ov.save_model(quantized_encoder, quantized_model_path / \"openvino_encoder_model.xml\")\n",
" del quantized_encoder\n",
" del encoder_calibration_data\n",
" gc.collect()\n",
@@ -1151,23 +1116,22 @@
" quantized_decoder_with_past = nncf.quantize(\n",
" ov_model.decoder_with_past.model,\n",
" nncf.Dataset(decoder_calibration_data),\n",
" preset=nncf.QuantizationPreset.MIXED,\n",
" subset_size=len(decoder_calibration_data),\n",
" model_type=nncf.ModelType.TRANSFORMER,\n",
" # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n",
" advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.95)\n",
" )\n",
" ov.save_model(quantized_decoder_with_past, quantized_distil_model_path / \"openvino_decoder_with_past_model.xml\")\n",
" ov.save_model(quantized_decoder_with_past, quantized_model_path / \"openvino_decoder_with_past_model.xml\")\n",
" del quantized_decoder_with_past\n",
" del decoder_calibration_data\n",
" gc.collect()\n",
"\n",
" # Copy the config file and the first-step-decoder manually\n",
" shutil.copy(model_path / \"config.json\", quantized_distil_model_path / \"config.json\")\n",
" shutil.copy(model_path / \"openvino_decoder_model.xml\", quantized_distil_model_path / \"openvino_decoder_model.xml\")\n",
" shutil.copy(model_path / \"openvino_decoder_model.bin\", quantized_distil_model_path / \"openvino_decoder_model.bin\")\n",
" shutil.copy(model_path / \"config.json\", quantized_model_path / \"config.json\")\n",
" shutil.copy(model_path / \"openvino_decoder_model.xml\", quantized_model_path / \"openvino_decoder_model.xml\")\n",
" shutil.copy(model_path / \"openvino_decoder_model.bin\", quantized_model_path / \"openvino_decoder_model.bin\")\n",
"\n",
" quantized_ov_model = OVModelForSpeechSeq2Seq.from_pretrained(quantized_distil_model_path, ov_config=ov_config, compile=False)\n",
" quantized_ov_model = OVModelForSpeechSeq2Seq.from_pretrained(quantized_model_path, ov_config=ov_config, compile=False)\n",
" quantized_ov_model.to(device.value)\n",
" quantized_ov_model.compile()\n",
" return quantized_ov_model\n",
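The invocation of quantize lives in a cell below the shown hunks; a hypothetical call site, assuming ov_model and CALIBRATION_DATASET_SIZE from the cells above:

# Hypothetical usage; the real call sits outside the displayed diff
ov_quantized_model = quantize(ov_model, CALIBRATION_DATASET_SIZE)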
