From fbd774e753293a6c9a24628d95858a270b6c8c20 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Mon, 22 Jan 2024 18:29:10 +0400 Subject: [PATCH] fix vegart syntax error, update stable zephyr (#1634) * fix vegart syntax error, update stable zephyr * fix saving fp16 calibrated model on win update jupyterlab (#1635) --- .docker/Pipfile.lock | 6 +- .../248-segmind-vegart.ipynb | 2 +- .../251-tiny-sd-image-generation.ipynb | 20 +- .../273-stable-zephyr-3b-chatbot.ipynb | 189 ++++++++---------- 4 files changed, 104 insertions(+), 113 deletions(-) diff --git a/.docker/Pipfile.lock b/.docker/Pipfile.lock index 01948adcdb3..0f6a3d0dd72 100644 --- a/.docker/Pipfile.lock +++ b/.docker/Pipfile.lock @@ -1562,12 +1562,12 @@ }, "jupyterlab": { "hashes": [ - "sha256:9ebada41d52651f623c0c9f069ddb8a21d6848e4c887d8e5ddc0613166ed5c0b", - "sha256:9f6f8e36d543fdbcc3df961a1d6a3f524b4a4001be0327a398f68fa4e534107c" + "sha256:d1aec24712566bc25a36229788242778e498ca4088028e2f9aa156b8b7fdc8fc", + "sha256:536bf0e78723153a5016ca7efb88ed0ecd7070d3f1555d5b0e2770658f900a3c" ], "index": "pypi", "markers": "python_version >= '3.8'", - "version": "==4.0.9" + "version": "==4.0.11" }, "jupyterlab-git": { "hashes": [ diff --git a/notebooks/248-stable-diffusion-xl/248-segmind-vegart.ipynb b/notebooks/248-stable-diffusion-xl/248-segmind-vegart.ipynb index bb4d301e356..2b92bd5edfc 100644 --- a/notebooks/248-stable-diffusion-xl/248-segmind-vegart.ipynb +++ b/notebooks/248-stable-diffusion-xl/248-segmind-vegart.ipynb @@ -686,7 +686,7 @@ "import time\n", "\n", "validation_size = 7\n", - "calibration_dataset = datasets.load_dataset(\"conceptual_captions\",, split=\"train\")\n", + "calibration_dataset = datasets.load_dataset(\"conceptual_captions\", split=\"train\")\n", "validation_data = []\n", "for batch in calibration_dataset:\n", " prompt = batch[\"caption\"]\n", diff --git a/notebooks/251-tiny-sd-image-generation/251-tiny-sd-image-generation.ipynb b/notebooks/251-tiny-sd-image-generation/251-tiny-sd-image-generation.ipynb index 3b8ac1dd3f0..6330d073844 100644 --- a/notebooks/251-tiny-sd-image-generation/251-tiny-sd-image-generation.ipynb +++ b/notebooks/251-tiny-sd-image-generation/251-tiny-sd-image-generation.ipynb @@ -54,6 +54,7 @@ "metadata": {}, "outputs": [], "source": [ + "%pip uninstall -y -q openvino-dev openvino openvino-nightly\n", "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu torch torchvision \"openvino-nightly\" \"diffusers>=0.18.0\" \"transformers>=4.30.2\" \"gradio\"" ] }, @@ -910,7 +911,10 @@ { "cell_type": "markdown", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "### Calibrate UNet for GPU inference\n", @@ -922,12 +926,16 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ "import pickle\n", "import urllib.request\n", + "import os\n", "\n", "# Fetch `model_upcast_utils` which helps to restore accuracy when inferred on GPU\n", "urllib.request.urlretrieve(\n", @@ -949,10 +957,12 @@ " unet_model = partially_upcast_nodes_to_fp32(unet_model, example_input, upcast_ratio=0.7,\n", " operation_types=[\"Convolution\"])\n", "\n", - " import os\n", + " ov.save_model(unet_model, UNET_OV_PATH.with_suffix(\"._tmp.xml\"))\n", + " del unet_model\n", " os.remove(UNET_OV_PATH)\n", " os.remove(str(UNET_OV_PATH).replace(\".xml\", \".bin\"))\n", - " ov.save_model(unet_model, UNET_OV_PATH)" + " 
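# Move the temporary copies into the place of the files removed above (overwriting the opened model files in place fails on Windows, per this commit)\n",
+    "    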
UNET_OV_PATH.with_suffix(\"._tmp.xml\").rename(UNET_OV_PATH)\n", + " UNET_OV_PATH.with_suffix(\"._tmp.bin\").rename(UNET_OV_PATH.with_suffix('.bin'))\n" ] }, { @@ -961,7 +971,7 @@ "metadata": {}, "outputs": [], "source": [ - "unet_model = core.compile_model(unet_model, device.value)" + "unet_model = core.compile_model(UNET_OV_PATH, device.value)" ] }, { diff --git a/notebooks/273-stable-zephyr-3b-chatbot/273-stable-zephyr-3b-chatbot.ipynb b/notebooks/273-stable-zephyr-3b-chatbot/273-stable-zephyr-3b-chatbot.ipynb index 1d8dcef9cd9..42203f1600c 100644 --- a/notebooks/273-stable-zephyr-3b-chatbot/273-stable-zephyr-3b-chatbot.ipynb +++ b/notebooks/273-stable-zephyr-3b-chatbot/273-stable-zephyr-3b-chatbot.ipynb @@ -47,12 +47,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "342d2a0e-08cb-4a65-9164-98cb82f25d8c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n", + "\u001b[33mWARNING: Skipping openvino-dev as it is not installed.\u001b[0m\u001b[33m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ - "%pip uninstall -q -y optimum-intel optimum\n", "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu -r ./openvino.genai/llm_bench/python/requirements.txt\n", "%pip uninstall -q -y openvino openvino-dev openvino-nightly\n", "%pip install -q openvino-nightly\n", @@ -79,29 +90,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino\n", - "/home/ea/work/genai_env/lib/python3.8/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11080). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n", - " return torch._C._cuda_getDeviceCount() > 0\n", - "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'\n", - "[ INFO ] openvino runtime version: 2024.0.0-13826-b51c5c0a997\n", - "model.safetensors: 100%|███████████████████| 5.59G/5.59G [04:19<00:00, 21.6MB/s]\n", - "generation_config.json: 100%|██████████████████| 111/111 [00:00<00:00, 13.6kB/s]\n", - "tokenizer_config.json: 100%|████████████████| 5.21k/5.21k [00:00<00:00, 839kB/s]\n", - "tokenizer.json: 100%|██████████████████████| 2.11M/2.11M [00:01<00:00, 2.10MB/s]\n", - "special_tokens_map.json: 100%|██████████████████| 587/587 [00:00<00:00, 429kB/s]\n", + "2024-01-22 17:38:18.929022: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-01-22 17:38:18.969898: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2024-01-22 17:38:19.655976: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino\n", + "[ INFO ] openvino runtime version: 2024.0.0-14080-d0619edd211\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "Using the export variant default. Available variants are:\n", " - default: The default ONNX variant.\n", - "Using framework PyTorch: 2.1.2+cu121\n", + "Using framework PyTorch: 2.0.1+cu117\n", "Overriding 1 configuration item(s)\n", "\t- use_cache -> True\n", - "/home/ea/.cache/huggingface/modules/transformers_modules/stabilityai/stable-zephyr-3b/9974c58a0ec4be4cd6f55e814a2a93b9cf163823/modeling_stablelm_epoch.py:106: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + "/home/ea/.cache/huggingface/modules/transformers_modules/stabilityai/stable-zephyr-3b/d3cd371e290a92f653b4cd07c825cf9fa43c49c9/modeling_stablelm_epoch.py:106: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", " if seq_len > self.max_seq_len_cached:\n", - "/home/ea/.cache/huggingface/modules/transformers_modules/stabilityai/stable-zephyr-3b/9974c58a0ec4be4cd6f55e814a2a93b9cf163823/modeling_stablelm_epoch.py:236: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + "/home/ea/.cache/huggingface/modules/transformers_modules/stabilityai/stable-zephyr-3b/d3cd371e290a92f653b4cd07c825cf9fa43c49c9/modeling_stablelm_epoch.py:236: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", " if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):\n", - "/home/ea/.cache/huggingface/modules/transformers_modules/stabilityai/stable-zephyr-3b/9974c58a0ec4be4cd6f55e814a2a93b9cf163823/modeling_stablelm_epoch.py:243: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n",
" if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):\n",
"/home/ea/.cache/huggingface/modules/transformers_modules/stabilityai/stable-zephyr-3b/d3cd371e290a92f653b4cd07c825cf9fa43c49c9/modeling_stablelm_epoch.py:253: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
" if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):\n",
"[ INFO ] Compress model weights to 4BIT_DEFAULT\n",
"[ INFO ] Compression options:\n",
"[ INFO ] {'mode': <CompressWeightsMode.INT4_SYM: 'int4_sym'>, 'group_size': 128}\n",
@@ -115,7 +122,7 @@
"+--------------+---------------------------+-----------------------------------+\n",
"| 4 | 91% (224 / 226) | 100% (224 / 224) |\n",
"+--------------+---------------------------+-----------------------------------+\n",
-"\u001b[2KApplying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m226/226\u001b[0m • \u001b[38;2;0;104;181m0:02:36\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m;0;104;181m0:00:01\u001b[0m181m0:00:08\u001b[0m\n",
+"\u001b[2KApplying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m226/226\u001b[0m • \u001b[38;2;0;104;181m0:04:15\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m;0;104;181m0:00:02\u001b[0m181m0:00:13\u001b[0m\n",
"\u001b[?25h"
]
}
],
@@ -148,38 +155,34 @@
"name": "stdout",
"output_type": "stream",
"text": [
-"INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino\n",
-"/home/ea/work/genai_env/lib/python3.8/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11080). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n",
-" return torch._C._cuda_getDeviceCount() > 0\n",
-"No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'\n",
+"INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino\n",
+"2024-01-22 17:43:38.099096: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-01-22 17:43:38.139649: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2024-01-22 17:43:38.777855: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", "[ INFO ] ==SUCCESS FOUND==: use_case: text_gen, model_type: stable-zephyr-3b\n", - "[ INFO ] ov_config={'PERFORMANCE_HINT': 'LATENCY', 'CACHE_DIR': '', 'NUM_STREAMS': '1'}\n", - "OPENVINO_TORCH_BACKEND_DEVICE=CPU\n", - "[ INFO ] model_path=stable-zephyr-3b/pytorch/dldt/compressed_weights/OV_FP16-4BIT_DEFAULT, openvino runtime version: 2024.0.0-13826-b51c5c0a997\n", + "[ INFO ] OV Config={'PERFORMANCE_HINT': 'LATENCY', 'CACHE_DIR': '', 'NUM_STREAMS': '1'}\n", + "[ INFO ] OPENVINO_TORCH_BACKEND_DEVICE=CPU\n", + "[ INFO ] Model path=stable-zephyr-3b/pytorch/dldt/compressed_weights/OV_FP16-4BIT_DEFAULT, openvino runtime version: 2024.0.0-14080-d0619edd211\n", + "Provided model does not contain state. It may lead to sub-optimal performance.Please reexport model with updated OpenVINO version >= 2023.3.0 calling the `from_pretrained` method with original model and `export=True` parameter\n", "Compiling the model to CPU ...\n", - "[ INFO ] From pretrained time: 5.89s\n", + "[ INFO ] From pretrained time: 5.18s\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", - "[ INFO ] num_iters=0, num_text_list=1\n", - "[ INFO ] input_text=Tell me story about cats\n", - "[ INFO ] Input token size:5, max_output_token_size:512\n", + "[ INFO ] Numbeams: 1, benchmarking iter nums(exclude warm-up): 0, prompt nums: 1\n", + "[ INFO ] [warm-up] Input text: Tell me story about cats\n", "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n", - "[ INFO ] [warm-up] Input token size: 5\n", - "[ INFO ] [warm-up] Output size: 290\n", - "[ INFO ] [warm-up] Infer count: 512\n", - "[ INFO ] [warm-up] Tokenization Time: 2.29ms\n", - "[ INFO ] [warm-up] Detokenization Time: 0.50ms\n", - "[ INFO ] [warm-up] Generation Time: 19.75s\n", - "[ INFO ] [warm-up] Latency: 68.09 ms/token\n", - "[ INFO ] [warm-up] Generated:\n", - "Tell me story about cats and dogs.\n", - "Once upon a time, in a small village, there lived a young girl named Lily. She had two pets, a cat named Mittens and a dog named Max. Mittens was very playful and loved to chase Max around the house. Max, on the other hand, was a bit timid and would often hide when Mittens came around.\n", - "One day, Mittens and Max were playing together in the backyard when a loud thunderstorm came suddenly. Mittens, being afraid of the thunder, ran inside the house, leaving Max behind. The rain was coming down hard, and Max was struggling to find his way back inside.\n", - "Lily, who was watching the storm from her bedroom, heard Max's cries and knew she had to help him. She ran down the stairs and found Max standing in the rain, looking lost. Lily knew just what to do. 
She picked up Max and carried him back to the house, where Mittens was waiting.\n", - "Mittens was relieved to see Max safe and sound, and the two of them snuggled up together on the couch for the rest of the storm. From that day on, Max was no longer afraid of Mittens, and the three of them became closer than ever before.\n", - "And that, my dear friends, is the story of Mittens, Max, and Lily, and how they overcame their fears and became a true family.<|endoftext|>\n", - "[ INFO ] [warm-up] Result MD5:['f5575487f181d7de8e4c095b39fa4180']\n", - "[ INFO ] [warm-up] First token latency: 1030.65 ms/token, other tokens latency: 64.68 ms/token, len of tokens: 290\n", - "[ INFO ] [warm-up] First token infer latency: 1021.36 ms/token, other tokens infer latency: 64.10 ms/token, len of tokens: 290\n" + "[ INFO ] [warm-up] Input token size: 5, Output size: 279, Infer count: 512, Tokenization Time: 2.88ms, Detokenization Time: 0.48ms, Generation Time: 18.57s, Latency: 66.55 ms/token\n", + "[ INFO ] [warm-up] First token latency: 1304.75 ms/token, other tokens latency: 62.02 ms/token, len of tokens: 279\n", + "[ INFO ] [warm-up] First infer latency: 1295.99 ms/infer, other infers latency: 61.45 ms/infer, inference count: 279\n", + "[ INFO ] [warm-up] Result MD5:['1275534c5906590ce297cf1059f24a90']\n", + "[ INFO ] [warm-up] Generated: Tell me story about cats and dogs.\n", + "Once upon a time, in a small village, there lived a young girl named Lily. She had two pets, a cat named Mittens and a dog named Max. Mittens was a beautiful black cat with green eyes, and Max was a big lovable golden retriever with a wagging tail.\n", + "One sunny day, Lily decided to take Mittens and Max for a walk in the nearby forest. As they were walking, they heard a loud barking sound. Suddenly, a pack of dogs appeared from the bushes, led by a big brown dog with a friendly smile.\n", + "Lily was scared and worried about her pets. She quickly remembered that she had a whistle that she used to train Max. She took a deep breath and blew the whistle.\n", + "Max, who was trained to respond to the whistle, ran towards Lily and the dogs. The big brown dog approached Lily and introduced himself as Buddy.\n", + "Lily was relieved and happy to see her pets safe and sound. She thanked Buddy for helping her and her pets.\n", + "From that day on, Lily and Buddy became good friends. They often went on walks in the forest, and Buddy even learned to stay and wait while Lily played with Mittens.\n", + "And so, Lily and Max and Buddy lived happily ever after, enjoying their time together in the forest.<|endoftext|>\n" ] } ], @@ -213,15 +216,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino\n", - "/home/ea/work/genai_env/lib/python3.8/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11080). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. 
(Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n", - " return torch._C._cuda_getDeviceCount() > 0\n", - "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'\n", - "[ INFO ] openvino runtime version: 2024.0.0-13826-b51c5c0a997\n", + "2024-01-22 17:44:08.466565: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-01-22 17:44:08.505214: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2024-01-22 17:44:09.190945: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino\n", + "[ INFO ] openvino runtime version: 2024.0.0-14080-d0619edd211\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "Using the export variant default. Available variants are:\n", " - default: The default ONNX variant.\n", - "Using framework PyTorch: 2.1.2+cu121\n", + "Using framework PyTorch: 2.0.1+cu117\n", "The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.\n", "Overriding 1 configuration item(s)\n", "\t- use_cache -> True\n", @@ -229,11 +233,10 @@ " if attention_mask.size(0) > 1:\n", "/home/ea/work/openvino_notebooks/notebooks/273-stable-zephyr-3b-chatbot/openvino.genai/llm_bench/python/utils/conversion_utils/better_transformer_patch.py:290: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", " if input_shape[-1] > 1:\n", - "/home/ea/.cache/huggingface/modules/transformers_modules/stabilityai/stable-zephyr-3b/9974c58a0ec4be4cd6f55e814a2a93b9cf163823/modeling_stablelm_epoch.py:106: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + "/home/ea/.cache/huggingface/modules/transformers_modules/stabilityai/stable-zephyr-3b/d3cd371e290a92f653b4cd07c825cf9fa43c49c9/modeling_stablelm_epoch.py:106: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n",
" if seq_len > self.max_seq_len_cached:\n",
"/home/ea/work/openvino_notebooks/notebooks/273-stable-zephyr-3b-chatbot/openvino.genai/llm_bench/python/utils/conversion_utils/better_transformer_patch.py:380: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
" if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):\n",
-
"[ INFO ] Compress model weights to 4BIT_DEFAULT\n",
"[ INFO ] Compression options:\n",
"[ INFO ] {'mode': <CompressWeightsMode.INT4_SYM: 'int4_sym'>, 'group_size': 128}\n",
@@ -246,7 +249,7 @@
"+--------------+---------------------------+-----------------------------------+\n",
"| 4 | 91% (224 / 226) | 100% (224 / 224) |\n",
"+--------------+---------------------------+-----------------------------------+\n",
-"\u001b[2KApplying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m226/226\u001b[0m • \u001b[38;2;0;104;181m0:02:35\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m;0;104;181m0:00:01\u001b[0m181m0:00:08\u001b[0m\n",
+"\u001b[2KApplying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m226/226\u001b[0m • \u001b[38;2;0;104;181m0:04:13\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m;0;104;181m0:00:02\u001b[0m181m0:00:13\u001b[0m\n",
"\u001b[?25h"
]
}
],
@@ -267,38 +270,33 @@
"name": "stdout",
"output_type": "stream",
"text": [
-"INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino\n",
-"/home/ea/work/genai_env/lib/python3.8/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11080). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n",
-" return torch._C._cuda_getDeviceCount() > 0\n",
-"No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'\n",
+"INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino\n",
+"2024-01-22 17:49:29.675469: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-01-22 17:49:29.715263: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2024-01-22 17:49:30.371670: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", "[ INFO ] ==SUCCESS FOUND==: use_case: text_gen, model_type: stable-zephyr-3b-stateful\n", - "[ INFO ] ov_config={'PERFORMANCE_HINT': 'LATENCY', 'CACHE_DIR': '', 'NUM_STREAMS': '1'}\n", - "OPENVINO_TORCH_BACKEND_DEVICE=CPU\n", - "[ INFO ] model_path=stable-zephyr-3b-stateful/pytorch/dldt/compressed_weights/OV_FP16-4BIT_DEFAULT, openvino runtime version: 2024.0.0-13826-b51c5c0a997\n", + "[ INFO ] OV Config={'PERFORMANCE_HINT': 'LATENCY', 'CACHE_DIR': '', 'NUM_STREAMS': '1'}\n", + "[ INFO ] OPENVINO_TORCH_BACKEND_DEVICE=CPU\n", + "[ INFO ] Model path=stable-zephyr-3b-stateful/pytorch/dldt/compressed_weights/OV_FP16-4BIT_DEFAULT, openvino runtime version: 2024.0.0-14080-d0619edd211\n", "Compiling the model to CPU ...\n", - "[ INFO ] From pretrained time: 5.70s\n", + "[ INFO ] From pretrained time: 5.38s\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", - "[ INFO ] num_iters=0, num_text_list=1\n", - "[ INFO ] input_text=Tell me story about cats\n", - "[ INFO ] Input token size:5, max_output_token_size:512\n", + "[ INFO ] Numbeams: 1, benchmarking iter nums(exclude warm-up): 0, prompt nums: 1\n", + "[ INFO ] [warm-up] Input text: Tell me story about cats\n", "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n", - "[ INFO ] [warm-up] Input token size: 5\n", - "[ INFO ] [warm-up] Output size: 290\n", - "[ INFO ] [warm-up] Infer count: 512\n", - "[ INFO ] [warm-up] Tokenization Time: 1.99ms\n", - "[ INFO ] [warm-up] Detokenization Time: 0.46ms\n", - "[ INFO ] [warm-up] Generation Time: 16.35s\n", - "[ INFO ] [warm-up] Latency: 56.37 ms/token\n", - "[ INFO ] [warm-up] Generated:\n", - "Tell me story about cats and dogs.\n", - "Once upon a time, in a small village, there lived a young girl named Lily. She had two pets, a cat named Mittens and a dog named Max. Mittens was very playful and loved to chase Max around the house. Max, on the other hand, was a bit timid and would often hide when Mittens came around.\n", - "One day, Mittens and Max were playing together in the backyard when a loud thunderstorm came suddenly. Mittens, being afraid of the thunder, ran inside the house, leaving Max behind. The rain was coming down hard, and Max was struggling to find his way back inside.\n", - "Lily, who was watching the storm from her bedroom, heard Max's cries and knew she had to help him. She ran down the stairs and found Max standing in the rain, looking lost. Lily knew just what to do. She picked up Max and carried him back to the house, where Mittens was waiting.\n", - "Mittens was relieved to see Max safe and sound, and the two of them snuggled up together on the couch for the rest of the storm. 
From that day on, Max was no longer afraid of Mittens, and the three of them became closer than ever before.\n", - "And that, my dear friends, is the story of Mittens, Max, and Lily, and how they overcame their fears and became a true family.<|endoftext|>\n", - "[ INFO ] [warm-up] Result MD5:['f5575487f181d7de8e4c095b39fa4180']\n", - "[ INFO ] [warm-up] First token latency: 1074.80 ms/token, other tokens latency: 52.77 ms/token, len of tokens: 290\n", - "[ INFO ] [warm-up] First token infer latency: 1073.78 ms/token, other tokens infer latency: 52.15 ms/token, len of tokens: 290\n" + "[ INFO ] [warm-up] Input token size: 5, Output size: 279, Infer count: 512, Tokenization Time: 2.15ms, Detokenization Time: 0.46ms, Generation Time: 15.03s, Latency: 53.86 ms/token\n", + "[ INFO ] [warm-up] First token latency: 1266.67 ms/token, other tokens latency: 49.42 ms/token, len of tokens: 279\n", + "[ INFO ] [warm-up] First infer latency: 1265.67 ms/infer, other infers latency: 48.81 ms/infer, inference count: 279\n", + "[ INFO ] [warm-up] Result MD5:['1275534c5906590ce297cf1059f24a90']\n", + "[ INFO ] [warm-up] Generated: Tell me story about cats and dogs.\n", + "Once upon a time, in a small village, there lived a young girl named Lily. She had two pets, a cat named Mittens and a dog named Max. Mittens was a beautiful black cat with green eyes, and Max was a big lovable golden retriever with a wagging tail.\n", + "One sunny day, Lily decided to take Mittens and Max for a walk in the nearby forest. As they were walking, they heard a loud barking sound. Suddenly, a pack of dogs appeared from the bushes, led by a big brown dog with a friendly smile.\n", + "Lily was scared and worried about her pets. She quickly remembered that she had a whistle that she used to train Max. She took a deep breath and blew the whistle.\n", + "Max, who was trained to respond to the whistle, ran towards Lily and the dogs. The big brown dog approached Lily and introduced himself as Buddy.\n", + "Lily was relieved and happy to see her pets safe and sound. She thanked Buddy for helping her and her pets.\n", + "From that day on, Lily and Buddy became good friends. They often went on walks in the forest, and Buddy even learned to stay and wait while Lily played with Mittens.\n", + "And so, Lily and Max and Buddy lived happily ever after, enjoying their time together in the forest.<|endoftext|>\n" ] } ], @@ -322,27 +320,10 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "433988ee-bda7-4224-9bf5-b013de4fcd65", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ea/work/genai_env/lib/python3.8/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11080). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. 
(Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n", - " return torch._C._cuda_getDeviceCount() > 0\n", - "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'\n" - ] - } - ], + "outputs": [], "source": [ "from utils.ov_model_classes import register_normalized_configs\n", "from optimum.intel.openvino import OVModelForCausalLM\n", @@ -351,7 +332,7 @@ "# Load model into Optimum Interface\n", "register_normalized_configs()\n", "\n", - "ov_model = OVModelForCausalLM.from_pretrained(model_path, compile=False, config=AutoConfig.from_pretrained(stateful_model_path, trust_remote_code=True), stateful=True)" + "ov_model = OVModelForCausalLM.from_pretrained(stateful_model_path, compile=False, config=AutoConfig.from_pretrained(stateful_model_path, trust_remote_code=True), stateful=True)" ] }, {