Update convert_and_optimize_asr.py #1659
@@ -1,13 +1,139 @@
import argparse
from pathlib import Path

import openvino as ov
from optimum.intel.openvino import OVModelForSpeechSeq2Seq
from transformers import AutoProcessor
from datasets import load_dataset

from itertools import islice
from typing import List, Any
from openvino import Tensor
import shutil
import nncf
from tqdm.notebook import tqdm
import gc

MODEL_NAME = "distil-whisper/distil-large-v2"

# We use post-training quantization with NNCF to quantize the model. This consists of the following steps:
# 1. Create a calibration dataset for quantization.
# 2. Run nncf.quantize to obtain the quantized encoder and decoder models.
# 3. Serialize the INT8 models with the openvino.save_model function.

# Step 1: Since we quantize the Whisper encoder and decoder separately, we need to prepare a calibration dataset for each of the models.
# We define an InferRequestWrapper class that intercepts model inputs and collects them in a list.
# We then run model inference on a small number of audio samples. Generally, increasing the calibration dataset size improves quantization quality.
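
# Illustrative sketch only (not executed; names match the functions defined below in this script), the three steps amount to:
#   encoder_data, decoder_data = collect_calibration_dataset(ov_model, calibration_dataset_size=50)
#   quantized_encoder = nncf.quantize(ov_model.encoder.model, nncf.Dataset(encoder_data), model_type=nncf.ModelType.TRANSFORMER)
#   ov.save_model(quantized_encoder, "openvino_encoder_model.xml")
# with the same pattern repeated for the decoder-with-past model.
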
class InferRequestWrapper:
    def __init__(self, request, data_cache: List):
        self.request = request
        self.data_cache = data_cache

    def __call__(self, *args, **kwargs):
        self.data_cache.append(*args)
        return self.request(*args, **kwargs)

    def infer(self, inputs: Any = None, shared_memory: bool = False):
        self.data_cache.append(inputs)
        return self.request.infer(inputs, shared_memory)

    def start_async(
        self,
        inputs: Any = None,
        userdata: Any = None,
        share_inputs: bool = False,
    ):
        self.data_cache.append(inputs)
        self.request.infer(inputs, share_inputs)

    def wait(self):
        pass

    def get_tensor(self, name: str):
        return Tensor(self.request.results[name])

    def __getattr__(self, attr):
        if attr in self.__dict__:
            return getattr(self, attr)
        return getattr(self.request, attr)


def collect_calibration_dataset(ov_model, calibration_dataset_size):
    # Overwrite model request properties, saving the original ones for restoring later
    original_encoder_request = ov_model.encoder.request
    original_decoder_with_past_request = ov_model.decoder_with_past.request
    encoder_calibration_data = []
    decoder_calibration_data = []
    ov_model.encoder.request = InferRequestWrapper(original_encoder_request, encoder_calibration_data)
    ov_model.decoder_with_past.request = InferRequestWrapper(original_decoder_with_past_request,
                                                             decoder_calibration_data)

    calibration_dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True)
    for sample in tqdm(islice(calibration_dataset, calibration_dataset_size), desc="Collecting calibration data",
                       total=calibration_dataset_size):
        input_features = extract_input_features(sample)
        ov_model.generate(input_features)

    ov_model.encoder.request = original_encoder_request
    ov_model.decoder_with_past.request = original_decoder_with_past_request

    return encoder_calibration_data, decoder_calibration_data


# Quantize Distil-Whisper encoder and decoder-with-past models.
def quantize(ov_model, calibration_dataset_size, quantized_distil_model_path, ov_config, output_dir):
    if not quantized_distil_model_path.exists():
        encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(
            ov_model, calibration_dataset_size
        )
        print("Quantizing encoder")
        quantized_encoder = nncf.quantize(
            ov_model.encoder.model,
            nncf.Dataset(encoder_calibration_data),
            preset=nncf.QuantizationPreset.MIXED,
            subset_size=len(encoder_calibration_data),
            model_type=nncf.ModelType.TRANSFORMER,
            # The SmoothQuant algorithm reduces activation quantization error; the optimal alpha value was obtained through grid search
            advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.50)
        )
        ov.save_model(quantized_encoder, quantized_distil_model_path / "openvino_encoder_model.xml")
        del quantized_encoder
        del encoder_calibration_data
        gc.collect()

        print("Quantizing decoder with past")
        quantized_decoder_with_past = nncf.quantize(
            ov_model.decoder_with_past.model,
            nncf.Dataset(decoder_calibration_data),
            preset=nncf.QuantizationPreset.MIXED,
            subset_size=len(decoder_calibration_data),
            model_type=nncf.ModelType.TRANSFORMER,
            # The SmoothQuant algorithm reduces activation quantization error; the optimal alpha value was obtained through grid search
            advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.95)
        )
        ov.save_model(quantized_decoder_with_past, quantized_distil_model_path / "openvino_decoder_with_past_model.xml")
        del quantized_decoder_with_past
        del decoder_calibration_data
        gc.collect()

        # Copy the config file and the first-step decoder manually
        shutil.copy(output_dir / "config.json", quantized_distil_model_path / "config.json")
        shutil.copy(output_dir / "openvino_decoder_model.xml", quantized_distil_model_path / "openvino_decoder_model.xml")
        shutil.copy(output_dir / "openvino_decoder_model.bin", quantized_distil_model_path / "openvino_decoder_model.bin")


# Prepare input sample
def extract_input_features(sample):
    processor = AutoProcessor.from_pretrained(MODEL_NAME)
    input_features = processor(
        sample["audio"]["array"],
        sampling_rate=sample["audio"]["sampling_rate"],
        return_tensors="pt",
    ).input_features
    return input_features
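
# Illustrative usage only (not executed here), assuming `ov_model` is a converted OVModelForSpeechSeq2Seq:
#   sample = next(iter(load_dataset("librispeech_asr", "clean", split="validation", streaming=True)))
#   features = extract_input_features(sample)
#   predicted_ids = ov_model.generate(features)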


def convert_asr_model(use_quantization: bool, model_dir: Path) -> Path:
    """
    Convert speech-to-text model

@@ -16,25 +142,45 @@ def convert_asr_model(model_dir: Path) -> Path:
    Returns:
        Path to exported model dir
    """
    output_dir = model_dir / (MODEL_NAME.rsplit("/")[-1] + "-FP16")
    ov_config = {"CACHE_DIR": ""}

    if not output_dir.exists():
        # load the model and convert it to OpenVINO IR, change precision to FP16, and save it to disk
        ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
            MODEL_NAME, ov_config=ov_config, export=True, compile=False, load_in_8bit=False
        )
        ov_model.half()
        ov_model.save_pretrained(output_dir)
    else:
        ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
            output_dir, ov_config=ov_config, compile=False
        )

Review comment: I wouldn't check if the model is converted. I don't assume one will convert to FP16 first and then to INT8.

    # load the processor; it is saved alongside the exported model below
    asr_processor = AutoProcessor.from_pretrained(MODEL_NAME)

    if use_quantization:
        # Use NNCF post-training quantization to quantize the ASR model
        CALIBRATION_DATASET_SIZE = 50
        quantized_distil_model_path = model_dir / (MODEL_NAME.rsplit("/")[-1] + "-INT8")
        ov_model.to("AUTO")
Review comment: Why AUTO device here? Shouldn't it be CPU or nothing?

Author reply: Hi Adrian, I replaced the code with an updated version that uses Optimum-Intel for weight compression directly. Please help review. Thanks~
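
For reference, a minimal sketch of what exporting with weight compression directly through Optimum-Intel could look like (illustrative only, not the code in this PR; load_in_8bit=True applies 8-bit weight-only compression at export time, which is not the same as the NNCF activation quantization used in quantize() above):

    ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
        MODEL_NAME, export=True, compile=False, load_in_8bit=True
    )
    ov_model.save_pretrained(model_dir / (MODEL_NAME.rsplit("/")[-1] + "-INT8"))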
        ov_model.compile()
        quantize(ov_model, CALIBRATION_DATASET_SIZE, quantized_distil_model_path, ov_config, output_dir)
        output_dir = quantized_distil_model_path

    # export also the processor
    asr_processor.save_pretrained(output_dir)

    return Path(output_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--use_quantization", default=True, help="Whether to quantize the ASR model")
    parser.add_argument("--model_dir", type=str, default="model", help="Directory to place the model in")

    args = parser.parse_args()

    convert_asr_model(args.use_quantization, Path(args.model_dir))
Review comment: tqdm is causing some errors for me (expecting a notebook?)
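
A likely cause is the `from tqdm.notebook import tqdm` import at the top of the script, which expects a Jupyter/ipywidgets frontend. For a plain script, something along these lines usually works (sketch, not applied in this PR):

    from tqdm import tqdm        # console progress bar
    # or, to select the appropriate frontend automatically:
    from tqdm.auto import tqdm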