diff --git a/notebooks/267-distil-whisper-asr/267-distil-whisper-asr.ipynb b/notebooks/267-distil-whisper-asr/267-distil-whisper-asr.ipynb
index 7c901710fdb..51761226ea1 100644
--- a/notebooks/267-distil-whisper-asr/267-distil-whisper-asr.ipynb
+++ b/notebooks/267-distil-whisper-asr/267-distil-whisper-asr.ipynb
@@ -70,11 +70,72 @@
     "## Load PyTorch model\n",
     "[back to top ⬆️](#Table-of-contents:)\n",
     "\n",
-    "The `AutoModelForSpeechSeq2Seq.from_pretrained` method is used for the initialization of PyTorch Whisper model using the transformers library. We will use the `distil-whisper/distil-large-v2` model as an example in this tutorial. The model will be downloaded once during first run and this process may require some time. More details about this model can be found in [model_card](https://huggingface.co/distil-whisper/distil-large-v2).\n",
+    "The `AutoModelForSpeechSeq2Seq.from_pretrained` method is used to initialize the PyTorch Whisper model using the Transformers library. By default, we will use the `distil-whisper/distil-large-v2` model as an example in this tutorial. The model will be downloaded once during the first run, which may take some time.\n",
+    "\n",
+    "You may also choose other models from the [Distil-Whisper Hugging Face collection](https://huggingface.co/collections/distil-whisper/distil-whisper-models-65411987e6727569748d2eb6), such as `distil-whisper/distil-medium.en` or `distil-whisper/distil-small.en`. Models of the original Whisper architecture are also available; see [here](https://huggingface.co/openai) for more details.\n",
     "\n",
     "Preprocessing and post-processing are important in this model use. `AutoProcessor` class used for initialization `WhisperProcessor` is responsible for preparing audio input data for the model, converting it to Mel-spectrogram and decoding predicted output token_ids into string using tokenizer."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "import ipywidgets as widgets\n",
+    "\n",
+    "model_ids = {\n",
+    "    \"Distil-Whisper\": [\n",
+    "        \"distil-whisper/distil-large-v2\",\n",
+    "        \"distil-whisper/distil-medium.en\",\n",
+    "        \"distil-whisper/distil-small.en\"\n",
+    "    ],\n",
+    "    \"Whisper\": [\n",
+    "        \"openai/whisper-large-v3\",\n",
+    "        \"openai/whisper-large-v2\",\n",
+    "        \"openai/whisper-large\",\n",
+    "        \"openai/whisper-medium\",\n",
+    "        \"openai/whisper-small\",\n",
+    "        \"openai/whisper-base\",\n",
+    "        \"openai/whisper-tiny\",\n",
+    "        \"openai/whisper-medium.en\",\n",
+    "        \"openai/whisper-small.en\",\n",
+    "        \"openai/whisper-base.en\",\n",
+    "        \"openai/whisper-tiny.en\",\n",
+    "    ]\n",
+    "}\n",
+    "\n",
+    "model_type = widgets.Dropdown(\n",
+    "    options=model_ids.keys(),\n",
+    "    value=\"Distil-Whisper\",\n",
+    "    description=\"Model type:\",\n",
+    "    disabled=False,\n",
+    ")\n",
+    "\n",
+    "model_type"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "model_id = widgets.Dropdown(\n",
+    "    options=model_ids[model_type.value],\n",
+    "    value=model_ids[model_type.value][0],\n",
+    "    description=\"Model:\",\n",
+    "    disabled=False,\n",
+    ")\n",
+    "\n",
+    "model_id"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
  {
   "cell_type": "code",
   "execution_count": 2,
@@ -97,12 +158,10 @@
   "source": [
    "from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq\n",
    "\n",
-    "distil_model_id = \"distil-whisper/distil-large-v2\"\n",
+    "processor = AutoProcessor.from_pretrained(model_id.value)\n",
    "\n",
-    "processor = AutoProcessor.from_pretrained(distil_model_id)\n",
-    "\n",
-    "pt_distil_model = AutoModelForSpeechSeq2Seq.from_pretrained(distil_model_id)\n",
-    "pt_distil_model.eval();"
+    "pt_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id.value)\n",
+    "pt_model.eval();"
   ]
  },
  {
@@ -196,7 +255,7 @@
   "source": [
    "import IPython.display as ipd\n",
    "\n",
-    "predicted_ids = pt_distil_model.generate(input_features)\n",
+    "predicted_ids = pt_model.generate(input_features)\n",
    "transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
    "\n",
    "display(ipd.Audio(sample[\"audio\"][\"array\"], rate=sample[\"audio\"][\"sampling_rate\"]))\n",
@@ -255,17 +314,18 @@
    "from pathlib import Path\n",
    "from optimum.intel.openvino import OVModelForSpeechSeq2Seq\n",
    "\n",
-    "distil_model_path = Path(distil_model_id.split(\"/\")[-1])\n",
+    "model_path = Path(model_id.value.replace('/', '_'))\n",
+    "ov_config = {\"CACHE_DIR\": \"\"}\n",
    "\n",
-    "if not distil_model_path.exists():\n",
-    "    ov_distil_model = OVModelForSpeechSeq2Seq.from_pretrained(\n",
-    "        distil_model_id, export=True, compile=False, load_in_8bit=False\n",
+    "if not model_path.exists():\n",
+    "    ov_model = OVModelForSpeechSeq2Seq.from_pretrained(\n",
+    "        model_id.value, ov_config=ov_config, export=True, compile=False, load_in_8bit=False\n",
    "    )\n",
-    "    ov_distil_model.half()\n",
-    "    ov_distil_model.save_pretrained(distil_model_path)\n",
+    "    ov_model.half()\n",
+    "    ov_model.save_pretrained(model_path)\n",
    "else:\n",
-    "    ov_distil_model = OVModelForSpeechSeq2Seq.from_pretrained(\n",
-    "        distil_model_path, compile=False\n",
+    "    ov_model = OVModelForSpeechSeq2Seq.from_pretrained(\n",
+    "        model_path, ov_config=ov_config, compile=False\n",
    "    )"
   ]
  },
@@ -352,8 +412,8 @@
   }
  ],
  "source": [
-    "ov_distil_model.to(device.value)\n",
-    "ov_distil_model.compile()"
+    "ov_model.to(device.value)\n",
"ov_model.compile()" ] }, { @@ -413,7 +473,7 @@ } ], "source": [ - "predicted_ids = ov_distil_model.generate(input_features)\n", + "predicted_ids = ov_model.generate(input_features)\n", "transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n", "\n", "display(ipd.Audio(sample[\"audio\"][\"array\"], rate=sample[\"audio\"][\"sampling_rate\"]))\n", @@ -499,8 +559,8 @@ } ], "source": [ - "perf_distil_torch = measure_perf(pt_distil_model, sample)\n", - "perf_distil_ov = measure_perf(ov_distil_model, sample)" + "perf_torch = measure_perf(pt_model, sample)\n", + "perf_ov = measure_perf(ov_model, sample)" ] }, { @@ -525,11 +585,9 @@ } ], "source": [ - "print(f\"Mean torch {distil_model_path.name} generation time: {perf_distil_torch:.3f}s\")\n", - "print(f\"Mean openvino {distil_model_path.name} generation time: {perf_distil_ov:.3f}s\")\n", - "print(\n", - " f\"Performance {distil_model_path.name} openvino speedup: {perf_distil_torch / perf_distil_ov:.3f}\"\n", - ")" + "print(f\"Mean torch {model_id.value} generation time: {perf_torch:.3f}s\")\n", + "print(f\"Mean openvino {model_id.value} generation time: {perf_ov:.3f}s\")\n", + "print(f\"Performance {model_id.value} openvino speedup: {perf_torch / perf_ov:.3f}\")" ] }, { @@ -541,154 +599,6 @@ "[back to top ⬆️](#Table-of-contents:)\n" ] }, - { - "cell_type": "markdown", - "id": "2f8fc7eb", - "metadata": {}, - "source": [ - "Since Distil-Whisper is optimized version of original OpenAI Whisper model, let's compare performance and check benefits of using it." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "5b5ba97b-539a-4aea-8f7d-1b5345a88c8c", - "metadata": { - "ExecuteTime": { - "end_time": "2023-11-08T15:07:37.410074400Z", - "start_time": "2023-11-08T15:06:45.795886200Z" - } - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b270aec6d62e48178cc4d88e23969ab3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Measuring performance: 0%| | 0/10 [00:00