diff --git a/notebooks/phi-3-vision/README.md b/notebooks/phi-3-vision/README.md
index 496d3c59930..4b42645c1b6 100644
--- a/notebooks/phi-3-vision/README.md
+++ b/notebooks/phi-3-vision/README.md
@@ -2,7 +2,7 @@
The [Phi-3-Vision-128K-Instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision. The model belongs to the Phi-3 model family, and the multimodal version comes with 128K context length (in tokens) it can support. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning and direct preference optimization to ensure precise instruction adherence and robust safety measures. More details about model can be found in [model blog post](https://azure.microsoft.com/en-us/blog/new-models-added-to-the-phi-3-family-available-on-microsoft-azure/), [technical report](https://aka.ms/phi3-tech-report), [Phi-3-cookbook](https://github.com/microsoft/Phi-3CookBook)
-In this tutorial we consider how to launch Phi-3-vision using OpenVINO for creation multimodal chatbot. Additionally, we optimize model to low precision using [NNCF](https://github.com/openvinotoolkit/nncf)
+In this tutorial we consider how to use the Phi-3-Vision model to build a multimodal chatbot using [Optimum Intel](https://github.com/huggingface/optimum-intel). Additionally, we optimize the model to low precision using [NNCF](https://github.com/openvinotoolkit/nncf).
## Notebook contents
The tutorial consists from following steps:
diff --git a/notebooks/phi-3-vision/ov_phi3_vision_helper.py b/notebooks/phi-3-vision/ov_phi3_vision_helper.py
deleted file mode 100644
index 97393bd5787..00000000000
--- a/notebooks/phi-3-vision/ov_phi3_vision_helper.py
+++ /dev/null
@@ -1,593 +0,0 @@
-from pathlib import Path
-import types
-from typing import Optional, Tuple, Union, List
-import gc
-import openvino as ov
-from openvino.runtime import opset13
-import nncf
-import numpy as np
-import torch
-from transformers import AutoModelForCausalLM, AutoProcessor, AutoConfig
-from transformers.generation import GenerationConfig, GenerationMixin
-from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPast
-
-
-def model_has_state(ov_model: ov.Model):
- return len(ov_model.get_sinks()) > 0
-
-
-def model_has_input_output_name(ov_model: ov.Model, name: str):
- """
- Helper function for checking that model has specified input or output name
-
- Parameters:
- ov_model (ov.Model):
- name (str):
- name of input or output
-
- Returns:
- True if input or output with requested name exists else False
- """
- return name in sum([list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], [])
-
-
-def fuse_cache_reorder(
- ov_model: ov.Model,
- not_kv_inputs: List[str],
- key_value_input_names: List[str],
- gather_dim: int,
-):
- """
- Fuses reored_cache during generate cycle into ov.Model. Used with stateful models, because we can not modify model state directly.
-
- Adds a new beam_idx parameter and Gather op per each kv-cache input in a given model.
- Should be run before make_stateful. Implements optimumum's _reorder_cache
- inside the model in the beginning of each iteration.
- Gather works along given gather_dim dimension that may vary from model to model.
- KV-cache inputs are identified based on names in key_value_input_names.
- Append the new beam_idx parameter to not_kv_inputs.
-
- Parameters:
- ov_model (`ov.Model`):
- openvino model for processing
- not_kv_inputs (`List[str]`):
- list of input nodes in model that not related to past key values
- key_value_input_names (`List[str]`):
- list of names for key value input layers
- gather_dim (int):
- dimension for gathering cache during reorder pass
- """
-
- if model_has_input_output_name(ov_model, "beam_idx"):
- raise ValueError("Model already has fused cache")
- input_batch = ov_model.input("inputs_embeds").get_partial_shape()[0]
- beam_idx = opset13.parameter(name="beam_idx", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch]))
- beam_idx.output(0).get_tensor().add_names({"beam_idx"}) # why list is not accepted?
- ov_model.add_parameters([beam_idx])
- not_kv_inputs.append(ov_model.inputs[-1])
- # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
- for input_name in key_value_input_names:
- parameter_output_port = ov_model.input(input_name)
- consumers = parameter_output_port.get_target_inputs()
- gather = opset13.gather(parameter_output_port, beam_idx, opset13.constant(gather_dim))
- for consumer in consumers:
- consumer.replace_source_output(gather.output(0))
- ov_model.validate_nodes_and_infer_types()
-
-
-def build_state_initializer(ov_model: ov.Model, batch_dim: int):
- """
- Build initialization ShapeOf Expression for all ReadValue ops
-
- Parameters:
- ov_model (ov.Model):
- openvino model
- batch_dim (int):
- index of dimension corresponding to batch size
- """
- input_ids = ov_model.input("inputs_embeds")
- batch = opset13.gather(
- opset13.shape_of(input_ids, output_type="i64"),
- opset13.constant([0]),
- opset13.constant(0),
- )
- for op in ov_model.get_ops():
- if op.get_type_name() == "ReadValue":
- dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))]
- dims[batch_dim] = batch
- dims = [(opset13.constant(np.array([dim], dtype=np.int64)) if isinstance(dim, int) else dim) for dim in dims]
- shape = opset13.concat(dims, axis=0)
- broadcast = opset13.broadcast(opset13.constant(0.0, dtype=op.get_output_element_type(0)), shape)
- op.set_arguments([broadcast])
- ov_model.validate_nodes_and_infer_types()
-
-
-def make_stateful(
- ov_model: ov.Model,
- not_kv_inputs: List[str],
- key_value_input_names: List[str],
- key_value_output_names: List[str],
- batch_dim: int,
- num_attention_heads: int,
- num_beams_and_batch: int = None,
-):
- """
- Hides kv-cache inputs and outputs inside the model as variables.
-
- Parameters:
- ov_model (ov.Model):
- openvino model
- not_kv_inputs (`List[str]`):
- list of input nodes in model that not related to past key values
- key_value_input_names (`List[str]`):
- list of names for key value input layers
- key_value_output_names (`List[str]`):
- list of names for key value input layers
- batch_dim (int):
- index of batch dimension in key value layers
- num_attention_heads (int):
- number of attention heads for batch dimension initialization
- num_beams_an_batch (int):
- precalculated number of beams and batch for shapes initialization
- """
- from openvino._offline_transformations import apply_make_stateful_transformation
-
- input_output_map = {}
-
- if num_beams_and_batch is not None:
- # Set batch size for input_ids and attention mask to avoid dynamic dimension got propagated from the end of the model back to ReadValue
- for input in not_kv_inputs:
- shape = input.get_partial_shape()
- if shape.rank.get_length() <= 2: # == 1 for beam_index
- shape[0] = num_beams_and_batch
- input.get_node().set_partial_shape(shape)
- for kv_name_pair in zip(key_value_input_names, key_value_output_names):
- input_output_map[kv_name_pair[0]] = kv_name_pair[1]
- if num_beams_and_batch is not None:
- input = ov_model.input(kv_name_pair[0])
- shape = input.get_partial_shape()
- shape[batch_dim] = num_beams_and_batch * num_attention_heads
- input.get_node().set_partial_shape(shape)
-
- if num_beams_and_batch is not None:
- # Re-validation model if shapes are altered above
- ov_model.validate_nodes_and_infer_types()
-
- apply_make_stateful_transformation(ov_model, input_output_map)
- if num_beams_and_batch is None:
- build_state_initializer(ov_model, batch_dim)
-
-
-def patch_stateful(ov_model):
- key_value_input_names = [key.get_any_name() for key in ov_model.inputs[2:-1]]
- key_value_output_names = [key.get_any_name() for key in ov_model.outputs[1:]]
- not_kv_inputs = [input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names())]
- if not key_value_input_names or not key_value_output_names:
- return
- batch_dim = 0
- num_attention_heads = 1
-
- fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim)
- make_stateful(
- ov_model,
- not_kv_inputs,
- key_value_input_names,
- key_value_output_names,
- batch_dim,
- num_attention_heads,
- None,
- )
-
-
-core = ov.Core()
-
-
-def cleanup_torchscript_cache():
- """
- Helper for removing cached model representation
- """
- torch._C._jit_clear_class_registry()
- torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
- torch.jit._state._clear_class_state()
-
-
-def convert_phi3_model(model_id, output_dir, quantization_config):
- model_name = Path(model_id).name
- output_dir = Path(output_dir)
-
- lang_model_path = output_dir / "language_model.xml"
- image_embed_path = output_dir / "image_embed.xml"
- img_projection_path = output_dir / "img_projection.xml"
- embed_token_path = output_dir / "embed_token.xml"
-
- if all(
- [
- lang_model_path.exists(),
- image_embed_path.exists(),
- img_projection_path.exists(),
- embed_token_path.exists(),
- ]
- ):
- print(f"✅ {model_name} model already converted. You can find results in {output_dir}")
- return
- print(f"⌛ {model_name} conversion started. Be patient, it may takes some time.")
- print("⌛ Load Original model")
- model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, _attn_implementation="eager")
- processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
- if getattr(processor, "chat_template", None) is None:
- processor.chat_template = processor.tokenizer.chat_template
- model.config.save_pretrained(output_dir)
- processor.save_pretrained(output_dir)
- print("✅ Original model successfully loaded")
-
- if not embed_token_path.exists():
- print("⌛ Convert Input embedding model")
- ov_model = ov.convert_model(
- model.model.embed_tokens,
- example_input=torch.ones([2, 2], dtype=torch.int64),
- )
- ov.save_model(ov_model, embed_token_path)
- del ov_model
- cleanup_torchscript_cache()
- gc.collect()
- print("✅ Input embedding model successfully converted")
-
- vision_embed_tokens = model.model.vision_embed_tokens
- if not image_embed_path.exists():
- print("⌛ Convert Image embedding model")
- vision_embed_tokens.forward = vision_embed_tokens.get_img_features
- ov_model = ov.convert_model(vision_embed_tokens, example_input=torch.ones([17, 3, 336, 336]))
- ov.save_model(ov_model, image_embed_path)
- del ov_model
- cleanup_torchscript_cache()
- gc.collect()
- print("✅ Image embedding model successfully converted")
-
- if not img_projection_path.exists():
- print("⌛ Convert Image projection model")
- ov_model = ov.convert_model(
- vision_embed_tokens.img_projection,
- example_input=torch.ones([1, 1921, 4096]),
- )
- ov.save_model(ov_model, img_projection_path)
- del ov_model
- cleanup_torchscript_cache()
- gc.collect()
- print("✅ Image projection model successfully converted")
-
- if not lang_model_path.exists():
- print("⌛ Convert Language model")
-
- def forward_wrap(
- self,
- attention_mask,
- position_ids=None,
- past_key_values=None,
- inputs_embeds=None,
- ):
- result = self._orig_forward(
- input_ids=None,
- attention_mask=attention_mask,
- position_ids=position_ids,
- past_key_values=past_key_values,
- inputs_embeds=inputs_embeds,
- )
- return tuple(result.values())
-
- model._orig_forward = model.forward
- model.forward = types.MethodType(forward_wrap, model)
- llm_input = torch.zeros([2, 2, 3072])
- pkv = model(
- inputs_embeds=llm_input,
- attention_mask=torch.ones((2, 2), dtype=torch.int64),
- )[1]
- model_inputs = ["attention_mask", "position_ids"]
- model_outputs = ["logits"]
- for idx in range(len(pkv)):
- model_inputs.extend([f"past_key_values.{idx}.key", f"past_key_values.{idx}.value"])
- model_outputs.extend([f"present.{idx}.key", f"present.{idx}.value"])
- model_inputs.append("inputs_embeds")
- position_ids = torch.tensor([[2, 3], [2, 3]])
- ov_model = ov.convert_model(
- model,
- example_input={
- "inputs_embeds": llm_input,
- "attention_mask": torch.ones([2, 4], dtype=torch.int64),
- "past_key_values": pkv,
- "position_ids": position_ids,
- },
- )
-
- for input, input_name in zip(ov_model.inputs, model_inputs):
- input.get_tensor().set_names({input_name})
-
- for output, output_name in zip(ov_model.outputs, model_outputs):
- output.get_tensor().set_names({output_name})
- patch_stateful(ov_model)
- print("✅ Language model successfully converted")
-
- if quantization_config is not None:
- print(f"⌛ Weights compression with {quantization_config['mode']} mode started")
- ov_model = nncf.compress_weights(ov_model, **quantization_config)
- print("✅ Weights compression finished")
-
- ov.save_model(ov_model, lang_model_path)
- del ov_model
- cleanup_torchscript_cache()
- del model
- gc.collect()
- print(f"✅ {model_name} model conversion finished. You can find results in {output_dir}")
-
-
-class OvPhi3Vision(GenerationMixin):
- def __init__(self, model_dir, device):
- model_dir = Path(model_dir)
- self.model = core.read_model(model_dir / "language_model.xml")
- self.image_embed = core.compile_model(model_dir / "image_embed.xml", device)
- self.img_projection = core.compile_model(model_dir / "img_projection.xml", device)
- self.embed_token = core.compile_model(model_dir / "embed_token.xml", device)
- self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
- self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)}
- compiled_model = core.compile_model(self.model, device)
- self.request = compiled_model.create_infer_request()
- self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
- self.generation_config = GenerationConfig.from_model_config(self.config)
- self.main_input_name = "input_ids"
- self.device = torch.device("cpu")
- self.num_pkv = 2
- self._supports_cache_class = False
- self.next_beam_idx = None
- self._past_length = None
- self.hd_transform_order = "glb_sub"
- self.num_img_tokens = self.config.img_processor["num_img_tokens"]
- self.image_dim_out = self.config.img_processor["image_dim_out"]
- self.glb_GN = torch.zeros([1, 1, self.image_dim_out * 4])
- self.sub_GN = torch.zeros([1, 1, 1, self.image_dim_out * 4])
-
- def can_generate(self):
- """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate."""
- return True
-
- def __call__(
- self,
- input_ids: torch.LongTensor,
- pixel_values: torch.Tensor,
- attention_mask: Optional[torch.LongTensor] = None,
- past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
- position_ids: Optional[torch.LongTensor] = None,
- image_sizes=None,
- **kwargs,
- ) -> CausalLMOutputWithPast:
- return self.forward(
- input_ids=input_ids,
- pixel_values=pixel_values,
- attention_mask=attention_mask,
- past_key_values=past_key_values,
- position_ids=position_ids,
- image_sizes=image_sizes,
- **kwargs,
- )
-
- def forward(
- self,
- input_ids: torch.LongTensor = None,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[List[torch.FloatTensor]] = None,
- inputs_embeds: Optional[torch.FloatTensor] = None,
- pixel_values: Optional[torch.FloatTensor] = None,
- image_sizes: Optional[torch.LongTensor] = None,
- **kwargs,
- ) -> Union[Tuple, BaseModelOutputWithPast]:
- if inputs_embeds is None:
- if pixel_values is not None and image_sizes is not None:
- inputs_embeds = self.vision_embed_tokens(input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
- else:
- inputs_embeds = self.embed_token(input_ids)[0]
- if past_key_values is None:
- self.request.reset_state()
- self.next_beam_idx = np.arange(inputs_embeds.shape[0], dtype=int)
- self._past_length = 0
- inputs = {}
- inputs["inputs_embeds"] = inputs_embeds
- inputs["attention_mask"] = attention_mask
- inputs["position_ids"] = position_ids
- if "beam_idx" in self.input_names:
- inputs["beam_idx"] = self.next_beam_idx if self.next_beam_idx is not None else np.arange(inputs_embeds.shape[0], dtype=int)
- self.request.start_async(inputs, share_inputs=True)
- self.request.wait()
- logits = self.request.get_tensor("logits").data
- logits = torch.from_numpy(logits).to(self.device)
- past_key_values = ((),)
- self._past_length += inputs["inputs_embeds"].shape[1]
-
- return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values)
-
- def _reorder_cache(self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
- """
- This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
- [`~PreTrainedModel.beam_sample`] is called.
- This is required to match `past_key_values` with the correct beam_idx at every generation step.
- """
- self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration
- return past_key_values
-
- def _get_past_length(self, past_key_values=None):
- if past_key_values is None:
- return 0
- return self._past_length
-
- def prepare_inputs_for_generation(
- self,
- input_ids,
- past_key_values=None,
- attention_mask=None,
- inputs_embeds=None,
- pixel_values=None,
- image_sizes=None,
- **kwargs,
- ):
- if past_key_values is not None:
- past_length = self._get_past_length(past_key_values)
-
- # Keep only the unprocessed tokens:
- # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
- # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
- # input)
- if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
- input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
- # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
- # input_ids based on the past_length.
- elif past_length < input_ids.shape[1]:
- input_ids = input_ids[:, past_length:]
-
- position_ids = kwargs.get("position_ids", None)
- if attention_mask is not None and position_ids is None:
- # create position_ids on the fly for batch generation
- position_ids = attention_mask.long().cumsum(-1) - 1
- position_ids.masked_fill_(attention_mask == 0, 1)
- if past_key_values:
- position_ids = position_ids[:, -input_ids.shape[1] :]
-
- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
- if inputs_embeds is not None and past_key_values is None:
- model_inputs = {"inputs_embeds": inputs_embeds}
- else:
- model_inputs = {"input_ids": input_ids}
-
- model_inputs.update(
- {
- "position_ids": position_ids,
- "past_key_values": past_key_values,
- "use_cache": kwargs.get("use_cache"),
- "attention_mask": attention_mask,
- "pixel_values": pixel_values,
- "image_sizes": image_sizes,
- }
- )
- return model_inputs
-
- def vision_embed_tokens(
- self,
- input_ids: torch.LongTensor,
- pixel_values: torch.FloatTensor,
- image_sizes=None,
- ) -> torch.FloatTensor:
- MAX_INPUT_ID = int(1e9)
- img_embeds = pixel_values
- img_sizes = image_sizes
-
- input_shape = input_ids.size()
- input_ids = input_ids.view(-1, input_shape[-1])
-
- with torch.no_grad():
- positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), as_tuple=False)
-
- select = False
- if len(positions.tolist()) > 0:
- g_values = abs(input_ids[positions[:, 0], positions[:, 1]])
-
- if img_sizes is not None and len(img_sizes):
- hd_transform = True
- bs = img_embeds.shape[0]
- # Nx(HW)xC
- img_features = torch.from_numpy(self.image_embed(img_embeds.flatten(0, 1))[0])
- base_feat_height = base_feat_width = int(img_features.shape[1] ** 0.5)
-
- # bs x max_num_crops x (24x24) x C
- img_features = img_features.view(bs, -1, base_feat_height * base_feat_width, self.image_dim_out)
- C = self.image_dim_out
- H = base_feat_height
-
- output_imgs = []
- output_len = []
- # training is tensor, inference is list
- if isinstance(img_sizes, torch.Tensor):
- img_sizes = img_sizes.view(-1, 2)
- for _bs in range(bs):
- h, w = img_sizes[_bs]
- h = h // 336
- w = w // 336
- B_ = h * w
-
- # 1 x (24x24) x 1024
- global_img_feature = img_features[_bs, :1]
-
- # 1 x 12 x 12 x 4096
- glb_img = (
- global_img_feature.reshape(1, H, H, C)
- .reshape(1, H // 2, 2, H // 2, 2, C)
- .contiguous()
- .permute(0, 1, 3, 2, 4, 5)
- .reshape(1, H // 2, H // 2, 4 * C)
- .contiguous()
- )
- temp_glb_GN = self.sub_GN.repeat(1, H // 2, 1, 1)
-
- # 1 x 156 x 4096
- glb_img = torch.cat([glb_img, temp_glb_GN], dim=2).reshape(1, -1, 4 * C)
-
- # (max_num_crops-1) x (12x12) x C
- sub_img = img_features[_bs, 1:]
- # 16x574x1024
- # get rid of padding sub_img
- sub_img = sub_img[:B_]
-
- # (num_crops, 12, 2, 12, 2, 1024) -> (num_crops, 12, 12, 2, 2, 1024) -> (num_crops, 12*12, 4*1024)
- sub_img = (
- sub_img.reshape(B_, H, H, C)
- .reshape(B_, H // 2, 2, H // 2, 2, C)
- .contiguous()
- .permute(0, 1, 3, 2, 4, 5)
- .reshape(B_, -1, 4 * C)
- .contiguous()
- )
- sub_img = sub_img.reshape(1, h, w, 12, 12, -1).permute(0, 1, 3, 2, 4, 5).reshape(1, h * 12, w * 12, 4 * C)
- temp_sub_GN = self.sub_GN.repeat(1, h * 12, 1, 1)
- sub_img = torch.cat([sub_img, temp_sub_GN], dim=2).reshape(1, -1, 4 * C)
- # (1, num_img_tokens, 1024*4)
-
- # glb + sub
- if self.hd_transform_order == "glb_sub":
- output_imgs.append(torch.cat([glb_img, self.glb_GN, sub_img], dim=1))
- elif self.hd_transform_order == "sub_glb":
- output_imgs.append(torch.cat([sub_img, self.glb_GN, glb_img], dim=1))
- else:
- raise NotImplementedError(f"hd_transform_order = {self.hd_transform_order}, not implemented")
-
- temp_len = int((h * w + 1) * 144 + 1 + (h + 1) * 12)
- output_len.append(temp_len)
-
- num_img_tokens = output_len
- img_set_tensor = []
- for _output_img in output_imgs:
- img_feature_proj = torch.from_numpy(self.img_projection(_output_img)[0])
- img_set_tensor.append(img_feature_proj)
- elif img_embeds.ndim == 4:
- selected_g_values = g_values[:: self.num_img_tokens]
- tt = self.image_embed(img_embeds).reshape(-1, self.image_dim_out)[0]
- img_set_tensor = torch.from_numpy(self.img_projection(tt)[0]) # adapted visual features.
- elif img_embeds.ndim == 3:
- selected_g_values = g_values[:: self.num_img_tokens]
- tt = img_embeds.view(-1, self.image_dim_out)
- img_set_tensor = torch.from_numpy(self.img_projection(tt)[0]) # adapted visual features.
- else:
- raise NotImplementedError
- select = True
- input_ids.clamp_min_(0).clamp_max_(self.config.vocab_size)
-
- hidden_states = torch.from_numpy(self.embed_token(input_ids)[0])
- if select:
- if hd_transform:
- idx = 0
- for i, cnt in enumerate(num_img_tokens):
- hidden_states[positions[idx, 0], positions[idx, 1] : positions[idx, 1] + cnt] = img_set_tensor[i]
- idx += cnt
- else:
- idx = 0
- for i, g in enumerate(selected_g_values):
- cnt = self.num_img_tokens
- hidden_states[positions[idx, 0], positions[idx, 1] : positions[idx, 1] + cnt] = img_set_tensor[i * cnt : (i + 1) * cnt]
- idx += cnt
-
- return hidden_states
diff --git a/notebooks/phi-3-vision/phi-3-vision.ipynb b/notebooks/phi-3-vision/phi-3-vision.ipynb
index b5f28e5697b..2414e247d49 100644
--- a/notebooks/phi-3-vision/phi-3-vision.ipynb
+++ b/notebooks/phi-3-vision/phi-3-vision.ipynb
@@ -10,7 +10,7 @@
"\n",
"The Phi-3-Vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision. The model belongs to the Phi-3 model family, and the multimodal version comes with 128K context length (in tokens) it can support. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning and direct preference optimization to ensure precise instruction adherence and robust safety measures. More details about model can be found in [model blog post](https://azure.microsoft.com/en-us/blog/new-models-added-to-the-phi-3-family-available-on-microsoft-azure/), [technical report](https://aka.ms/phi3-tech-report), [Phi-3-cookbook](https://github.com/microsoft/Phi-3CookBook)\n",
"\n",
- "In this tutorial we consider how to launch Phi-3-vision using OpenVINO for creation multimodal chatbot. Additionally, we optimize model to low precision using [NNCF](https://github.com/openvinotoolkit/nncf)\n",
+ "In this tutorial we consider how to use Phi-3-Vision model to build multimodal chatbot using [Optimum Intel](https://github.com/huggingface/optimum-intel). Additionally, we optimize model to low precision using [NNCF](https://github.com/openvinotoolkit/nncf)\n",
"#### Table of contents:\n",
"\n",
"- [Prerequisites](#Prerequisites)\n",
@@ -51,8 +51,9 @@
"metadata": {},
"outputs": [],
"source": [
- "%pip install -q \"torch>=2.1\" \"torchvision\" \"transformers>=4.40\" \"protobuf>=3.20\" \"gradio>=4.26\" \"Pillow\" \"accelerate\" \"tqdm\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
- "%pip install -q \"openvino>=2024.2.0\" \"nncf>=2.11.0\""
+ "%pip install -q \"torch>=2.1\" \"torchvision\" \"transformers>=4.45\" \"protobuf>=3.20\" \"gradio>=4.26\" \"Pillow\" \"accelerate\" \"tqdm\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
+ "%pip install -q -U \"openvino>=2024.6.0\" \"openvino-tokenizrs>=2024.6.0\" \"nncf>=2.14.0\"\n",
+ "%pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" --extra-index-url https://download.pytorch.org/whl/cpu"
]
},
{
@@ -65,9 +66,9 @@
"import requests\n",
"from pathlib import Path\n",
"\n",
- "if not Path(\"ov_phi3_vision_helper.py\").exists():\n",
- " r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/phi-3-vision/ov_phi3_vision_helper.py\")\n",
- " open(\"ov_phi3_vision_helper.py\", \"w\").write(r.text)\n",
+ "if not Path(\"cmd_helper.py\").exists():\n",
+ " r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py\")\n",
+ " open(\"cmd_helper.py\", \"w\").write(r.text)\n",
"\n",
"\n",
"if not Path(\"gradio_helper.py\").exists():\n",
@@ -97,14 +98,14 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 3,
"id": "0fe78ef3",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "efd492314046473cbadf8dd818c2c5cd",
+ "model_id": "7af21c571029466fa8856ffbf1b325d2",
"version_major": 2,
"version_minor": 0
},
@@ -112,7 +113,7 @@
"Dropdown(description='Model:', options=('microsoft/Phi-3.5-vision-instruct', 'microsoft/Phi-3-vision-128k-inst…"
]
},
- "execution_count": 1,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -136,6 +137,26 @@
"model_dropdown"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "4470a3c4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Selected microsoft/Phi-3.5-vision-instruct\n"
+ ]
+ }
+ ],
+ "source": [
+ "model_id = model_dropdown.value\n",
+ "print(f\"Selected {model_id}\")\n",
+ "MODEL_DIR = Path(model_id.split(\"/\")[-1])"
+ ]
+ },
{
"attachments": {},
"cell_type": "markdown",
@@ -147,26 +168,22 @@
"\n",
"Phi-3-vision is PyTorch model. OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate Representation (IR). [OpenVINO model conversion API](https://docs.openvino.ai/2024/openvino-workflow/model-preparation.html#convert-a-model-with-python-convert-model) should be used for these purposes. `ov.convert_model` function accepts original PyTorch model instance and example input for tracing and returns `ov.Model` representing this model in OpenVINO framework. Converted model can be used for saving on disk using `ov.save_model` function or directly loading on device using `core.complie_model`. \n",
"\n",
- "The script `ov_phi3_vision_helper.py` contains helper function for model conversion, please check its content if you interested in conversion details.\n",
+ "OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate Representation format. For convenience, we will use OpenVINO integration with HuggingFace Optimum. 🤗 [Optimum Intel](https://huggingface.co/docs/optimum/intel/index) is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures.\n",
"\n",
- "\n",
- " Click here for more detailed explanation of conversion steps
\n",
- "Phi-3-vision is autoregressive transformer generative model, it means that each next model step depends from model output from previous step. The generation approach is based on the assumption that the probability distribution of a word sequence can be decomposed into the product of conditional next word distributions. In other words, model predicts the next token in the loop guided by previously generated tokens until the stop-condition will be not reached (generated sequence of maximum length or end of string token obtained). The way the next token will be selected over predicted probabilities is driven by the selected decoding methodology. You can find more information about the most popular decoding methods in this blog. The entry point for the generation process for models from the Hugging Face Transformers library is the `generate` method. You can find more information about its parameters and configuration in the documentation. To preserve flexibility in the selection decoding methodology, we will convert only model inference for one step.\n",
+ "Among other use cases, Optimum Intel provides a simple interface to optimize your Transformers and Diffusers models, convert them to the OpenVINO Intermediate Representation (IR) format and run inference using OpenVINO Runtime. `optimum-cli` provides command line interface for model conversion and optimization. \n",
"\n",
- "The inference flow has difference on first step and for the next. On the first step, model accept preprocessed input instruction and image, that transformed to the unified embedding space using `input_embedding` and `image_encoder` models, after that `language model`, LLM-based part of model, runs on input embeddings to predict probability of next generated tokens. On the next step, `language_model` accepts only next token id selected based on sampling strategy and processed by `input_embedding` model and cached attention key and values. Since the output side is auto-regressive, an output token hidden state remains the same once computed for every further generation step. Therefore, recomputing it every time you want to generate a new token seems wasteful. With the cache, the model saves the hidden state once it has been computed. The model only computes the one for the most recently generated output token at each time step, re-using the saved ones for hidden tokens. This reduces the generation complexity from $O(n^3)$ to $O(n^2)$ for a transformer model. More details about how it works can be found in this [article](https://scale.com/blog/pytorch-improvements#Text%20Translation). For improving support images of various resolution, input image separated on patches and processed by `image feature extractor` and `image projector` that are part of image encoder.\n",
+ "General command format:\n",
"\n",
- "To sum up above, model consists of 4 parts:\n",
+ "```bash\n",
+ "optimum-cli export openvino --model --task \n",
+ "```\n",
"\n",
- "* **Image feature extractor** and **Image projector** for encoding input images into embedding space.\n",
- "* **Input Embedding** for conversion input text tokens into embedding space\n",
- "* **Language Model** for generation answer based on input embeddings provided by Image Encoder and Input Embedding models.\n",
- "\n",
- " \n",
+ "where task is task to export the model for, if not specified, the task will be auto-inferred based on the model. You can find a mapping between tasks and model classes in Optimum TaskManager [documentation](https://huggingface.co/docs/optimum/exporters/task_manager). Additionally, you can specify weights compression using `--weight-format` argument with one of following options: `fp32`, `fp16`, `int8` and `int4`. Fro int8 and int4 [nncf](https://github.com/openvinotoolkit/nncf) will be used for weight compression. More details about model export provided in [Optimum Intel documentation](https://huggingface.co/docs/optimum/intel/openvino/export#export-your-model).\n",
"\n",
"\n",
"### Compress model weights to 4-bit\n",
"[back to top ⬆️](#Table-of-contents:)\n",
- "For reducing memory consumption, weights compression optimization can be applied using [NNCF](https://github.com/openvinotoolkit/nncf). \n",
+ "For reducing memory consumption, weights compression optimization can be applied using [NNCF](https://github.com/openvinotoolkit/nncf) during run Optimum Intel CLI.\n",
"\n",
"\n",
" Click here for more details about weight compression
\n",
@@ -186,217 +203,111 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 5,
"id": "72664892-11e5-4164-a1ef-e8631b50f232",
"metadata": {},
"outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, openvino\n"
- ]
- }
- ],
- "source": [
- "from ov_phi3_vision_helper import convert_phi3_model\n",
- "\n",
- "# uncomment these lines to see model conversion code\n",
- "# convert_phi3_model??"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "7efafbc8",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "⌛ Phi-3.5-vision-instruct conversion started. Be patient, it may takes some time.\n",
- "⌛ Load Original model\n"
- ]
- },
{
"data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "487eabdfa7d1472a85a67b00fe27b2a0",
- "version_major": 2,
- "version_minor": 0
- },
+ "text/markdown": [
+ "**Export command:**"
+ ],
"text/plain": [
- "Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]"
+       "<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/ytarkan/miniconda3/envs/ov_notebooks_env/lib/python3.9/site-packages/transformers/models/auto/image_processing_auto.py:513: FutureWarning: The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead\n",
- " warnings.warn(\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "✅ Original model successfully loaded\n",
- "⌛ Convert Input embedding model\n",
- "WARNING:nncf:NNCF provides best results with torch==2.3.*, while current torch version is 2.4.1+cpu. If you encounter issues, consider switching to torch==2.3.*\n",
- "✅ Input embedding model successfully converted\n",
- "⌛ Convert Image embedding model\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/ytarkan/miniconda3/envs/ov_notebooks_env/lib/python3.9/site-packages/transformers/modeling_utils.py:4713: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n",
- " warnings.warn(\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "✅ Image embedding model successfully converted\n",
- "⌛ Convert Image projection model\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "You are not running the flash-attention implementation, expect numerical differences.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "✅ Image projection model successfully converted\n",
- "⌛ Convert Language model\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/ytarkan/miniconda3/envs/ov_notebooks_env/lib/python3.9/site-packages/transformers/modeling_attn_mask_utils.py:114: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:\n",
- "/home/ytarkan/miniconda3/envs/ov_notebooks_env/lib/python3.9/site-packages/transformers/modeling_attn_mask_utils.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " if past_key_values_length > 0:\n",
- "/home/ytarkan/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/c69b58c1be8dfda05972fdf495be3511e3b0f61f/modeling_phi3_v.py:445: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " if seq_len > self.original_max_position_embeddings:\n",
- "/home/ytarkan/miniconda3/envs/ov_notebooks_env/lib/python3.9/site-packages/nncf/torch/dynamic_graph/wrappers.py:86: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
- " op1 = operator(*args, **kwargs)\n",
- "/home/ytarkan/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/c69b58c1be8dfda05972fdf495be3511e3b0f61f/modeling_phi3_v.py:683: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):\n",
- "/home/ytarkan/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/c69b58c1be8dfda05972fdf495be3511e3b0f61f/modeling_phi3_v.py:690: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):\n",
- "/home/ytarkan/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/c69b58c1be8dfda05972fdf495be3511e3b0f61f/modeling_phi3_v.py:702: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
- " if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):\n",
- "/home/ytarkan/miniconda3/envs/ov_notebooks_env/lib/python3.9/site-packages/torch/jit/_trace.py:168: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:489.)\n",
- " if a.grad is not None:\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "✅ Language model successfully converted\n",
- "⌛ Weights compression with int4_sym mode started\n"
- ]
- },
{
"data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "60270b899d7746b0b630353202afc90c",
- "version_major": 2,
- "version_minor": 0
- },
+ "text/markdown": [
+ "`optimum-cli export openvino --model microsoft/Phi-3.5-vision-instruct Phi-3.5-vision-instruct/INT4 --weight-format int4 --trust-remote-code`"
+ ],
"text/plain": [
- "Output()"
+       "<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
- "data": {
- "text/html": [
- "\n"
- ],
- "text/plain": []
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
+ "name": "stderr",
"output_type": "stream",
"text": [
- "INFO:nncf:Statistics of the bitwidth distribution:\n",
- "┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
- "│ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n",
- "┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n",
- "│ 8 │ 42% (54 / 129) │ 40% (53 / 128) │\n",
- "├────────────────┼─────────────────────────────┼────────────────────────────────────────┤\n",
- "│ 4 │ 58% (75 / 129) │ 60% (75 / 128) │\n",
- "┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n"
+ "2024-12-24 08:39:28.193255: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+ "2024-12-24 08:39:28.205380: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
+ "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
+ "E0000 00:00:1735015168.220063 230613 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
+ "E0000 00:00:1735015168.224457 230613 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
+ "2024-12-24 08:39:28.238718: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+ "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.14s/it]\n",
+ "The class `optimum.bettertransformers.transformation.BetterTransformer` is deprecated and will be removed in a future release.\n",
+ "WARNING:root:Cannot apply model.to_bettertransformer because of the exception:\n",
+ "The model type phi3_v is not yet supported to be used with BetterTransformer. Feel free to open an issue at https://github.com/huggingface/optimum/issues if you would like this model type to be supported. Currently supported models are: dict_keys(['albert', 'bark', 'bart', 'bert', 'bert-generation', 'blenderbot', 'bloom', 'camembert', 'blip-2', 'clip', 'codegen', 'data2vec-text', 'deit', 'distilbert', 'electra', 'ernie', 'fsmt', 'gpt2', 'gptj', 'gpt_neo', 'gpt_neox', 'hubert', 'layoutlm', 'm2m_100', 'marian', 'markuplm', 'mbart', 'opt', 'pegasus', 'rembert', 'prophetnet', 'roberta', 'roc_bert', 'roformer', 'splinter', 'tapas', 't5', 'vilt', 'vit', 'vit_mae', 'vit_msn', 'wav2vec2', 'xlm-roberta', 'yolos']).. Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention\n",
+ "`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.\n",
+ "/home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:458: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results.\n",
+ " or len(self.key_cache[layer_idx]) == 0 # the layer has no cache\n",
+ "/home/ea/work/py311/lib/python3.11/site-packages/transformers/modeling_attn_mask_utils.py:116: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+ " if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:\n",
+ "/home/ea/work/py311/lib/python3.11/site-packages/optimum/exporters/onnx/model_patcher.py:306: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+ " if past_key_values_length > 0:\n",
+ "/home/ea/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca/modeling_phi3_v.py:444: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+ " seq_len = seq_len or torch.max(position_ids) + 1\n",
+ "/home/ea/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca/modeling_phi3_v.py:445: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+ " if seq_len > self.original_max_position_embeddings:\n",
+ "/home/ea/work/py311/lib/python3.11/site-packages/nncf/torch/dynamic_graph/wrappers.py:85: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
+ " op1 = operator(*args, **kwargs)\n",
+ "/home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results.\n",
+ " elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors\n",
+ "/home/ea/work/py311/lib/python3.11/site-packages/transformers/models/clip/modeling_clip.py:243: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+ " if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):\n"
]
},
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "cfdf543cc4f0411d9df05525abab5397",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Output()"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "\n"
- ],
- "text/plain": []
- },
- "metadata": {},
- "output_type": "display_data"
- },
{
"name": "stdout",
"output_type": "stream",
"text": [
- "✅ Weights compression finished\n",
- "✅ Phi-3.5-vision-instruct model conversion finished. You can find results in model/Phi-3.5-vision-instruct/INT4\n"
+ "INFO:nncf:Statistics of the bitwidth distribution:\n",
+ "┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
+ "│ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n",
+ "┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n",
+ "│ int8_asym │ 3% (1 / 129) │ 0% (0 / 128) │\n",
+ "├───────────────────────────┼─────────────────────────────┼────────────────────────────────────────┤\n",
+ "│ int4_asym │ 97% (128 / 129) │ 100% (128 / 128) │\n",
+ "┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n",
+ "\u001b[2KApplying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m • \u001b[38;2;0;104;181m0:01:58\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m;0;104;181m0:00:01\u001b[0m181m0:00:05\u001b[0m\n",
+ "\u001b[?25hINFO:nncf:Statistics of the bitwidth distribution:\n",
+ "┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
+ "│ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n",
+ "┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n",
+ "│ int8_sym │ 100% (139 / 139) │ 100% (139 / 139) │\n",
+ "┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n",
+ "\u001b[2KApplying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m • \u001b[38;2;0;104;181m0:00:01\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m01\u001b[0m • \u001b[38;2;0;104;181m0:00:01\u001b[0m\n",
+ "\u001b[?25hINFO:nncf:Statistics of the bitwidth distribution:\n",
+ "┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
+ "│ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n",
+ "┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n",
+ "│ int8_sym │ 100% (1 / 1) │ 100% (1 / 1) │\n",
+ "┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n",
+ "\u001b[2KApplying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n",
+ "\u001b[?25hINFO:nncf:Statistics of the bitwidth distribution:\n",
+ "┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
+ "│ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n",
+ "┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n",
+ "│ int8_sym │ 100% (2 / 2) │ 100% (2 / 2) │\n",
+ "┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n",
+ "\u001b[2KApplying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n",
+ "\u001b[?25h"
]
}
],
"source": [
- "from pathlib import Path\n",
- "import nncf\n",
- "\n",
+ "from cmd_helper import optimum_cli\n",
"\n",
- "model_id = model_dropdown.value\n",
- "out_dir = Path(\"model\") / Path(model_id).name / \"INT4\"\n",
- "compression_configuration = {\n",
- " \"mode\": nncf.CompressWeightsMode.INT4_SYM,\n",
- " \"group_size\": 64,\n",
- " \"ratio\": 0.6,\n",
- "}\n",
- "convert_phi3_model(model_id, out_dir, compression_configuration)"
+ "if not (MODEL_DIR / \"INT4\").exists():\n",
+ " optimum_cli(model_id, MODEL_DIR / \"INT4\", additional_args={\"weight-format\": \"int4\", \"trust-remote-code\": \"\"})"
]
},
{
@@ -411,22 +322,22 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 6,
"id": "881cfdca",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "b2dc3a32cec74410a8978dcd470c905b",
+ "model_id": "4e17d407db3f4ee8b0a6354eff65291a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
- "Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO')"
+ "Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')"
]
},
- "execution_count": 4,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -448,36 +359,24 @@
"## Run OpenVINO model\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
- "`OvPhi3vison` class provides convenient way for running model. It accepts directory with converted model and inference device as arguments. For running model we will use `generate` method."
+ "OpenVINO integration with Optimum Intel provides ready-to-use API for model inference that can be used for smooth integration with transformers-based solutions. For loading model, we will use `OVModelForVisualCausalLM` class that have compatible interface with Transformers LLaVA implementation. For loading a model, `from_pretrained` method should be used. It accepts path to the model directory or model_id from HuggingFace hub (if model is not converted to OpenVINO format, conversion will be triggered automatically). Additionally, we can provide an inference device, quantization config (if model has not been quantized yet) and device-specific OpenVINO Runtime configuration. More details about model inference with Optimum Intel can be found in [documentation](https://huggingface.co/docs/optimum/intel/openvino/inference)."
]
},
{
"cell_type": "code",
- "execution_count": 6,
- "id": "22e6d173-1a44-4a40-a29d-186e4c1de60d",
+ "execution_count": 9,
+ "id": "bbb34970-45bb-405f-a362-50992b30f765",
"metadata": {},
"outputs": [],
"source": [
- "from ov_phi3_vision_helper import OvPhi3Vision\n",
- "\n",
- "# Uncomment below lines to see the model inference class code\n",
+ "from optimum.intel.openvino import OVModelForVisualCausalLM\n",
"\n",
- "# OvPhi3Vision??"
+ "model = OVModelForVisualCausalLM.from_pretrained(MODEL_DIR / \"INT4\", device=device.value, trust_remote_code=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
- "id": "bbb34970-45bb-405f-a362-50992b30f765",
- "metadata": {},
- "outputs": [],
- "source": [
- "model = OvPhi3Vision(out_dir, device.value)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
"id": "bf7b045b-89ea-48e8-bee3-9c61968a6a91",
"metadata": {
"tags": []
@@ -499,7 +398,7 @@
""
]
},
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -517,7 +416,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 11,
"id": "24d3eed7",
"metadata": {},
"outputs": [
@@ -526,7 +425,7 @@
"output_type": "stream",
"text": [
"Answer:\n",
- "The cat is lying on its back inside a cardboard box, which is not a common position for a cat to be in.\n"
+ "A cat is lying in a box.\n"
]
}
],
@@ -537,7 +436,7 @@
" {\"role\": \"user\", \"content\": \"<|image_1|>\\nWhat is unusual on this picture?\"},\n",
"]\n",
"\n",
- "processor = AutoProcessor.from_pretrained(out_dir, trust_remote_code=True)\n",
+ "processor = AutoProcessor.from_pretrained(MODEL_DIR / \"INT4\", trust_remote_code=True)\n",
"\n",
"prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
"\n",
@@ -561,7 +460,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"id": "ba58378b",
"metadata": {
"tags": []
@@ -598,7 +497,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.19"
+ "version": "3.11.4"
},
"openvino_notebooks": {
"imageUrl": "https://github.com/user-attachments/assets/a0c07db9-69d4-4dea-a8fc-424c02ccebf4",
@@ -618,7 +517,66 @@
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
- "state": {},
+ "state": {
+ "2d7fd7109d834307b1a02a93119cf551": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "description_width": ""
+ }
+ },
+ "4e17d407db3f4ee8b0a6354eff65291a": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "DropdownModel",
+ "state": {
+ "_options_labels": [
+ "CPU",
+ "AUTO"
+ ],
+ "description": "Device:",
+ "index": 1,
+ "layout": "IPY_MODEL_5afc48d3822142e7a90a4857b9986648",
+ "style": "IPY_MODEL_df83c50e820740ff984874f1e1998f3c"
+ }
+ },
+ "5afc48d3822142e7a90a4857b9986648": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {}
+ },
+ "7af21c571029466fa8856ffbf1b325d2": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "DropdownModel",
+ "state": {
+ "_options_labels": [
+ "microsoft/Phi-3.5-vision-instruct",
+ "microsoft/Phi-3-vision-128k-instruct"
+ ],
+ "description": "Model:",
+ "index": 0,
+ "layout": "IPY_MODEL_ed00261277de4392a547a7ce03d1fc8d",
+ "style": "IPY_MODEL_2d7fd7109d834307b1a02a93119cf551"
+ }
+ },
+ "df83c50e820740ff984874f1e1998f3c": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "2.0.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "description_width": ""
+ }
+ },
+ "ed00261277de4392a547a7ce03d1fc8d": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "2.0.0",
+ "model_name": "LayoutModel",
+ "state": {}
+ }
+ },
"version_major": 2,
"version_minor": 0
}