-
Notifications
You must be signed in to change notification settings - Fork 233
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
132 additions
and
0 deletions.
There are no files selected for viewing
26 changes: 26 additions & 0 deletions
26
examples/llm_compression/openvino/smollm_360m_fp8/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Large Language Models FP8 Compression Example | ||
|
||
This example demonstrates how to optimize Large Language Models (LLMs) using the NNCF quantize API. The example applies FP8 quantization to the [HuggingFaceTB/SmolLM-360M](https://huggingface.co/HuggingFaceTB/SmolLM-360M) model. This leads to a significant decrease in model footprint and a performance improvement with OpenVINO. | ||
|
||
## Prerequisites | ||
|
||
To use this example: | ||
|
||
- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate` | ||
- Install dependencies: | ||
|
||
```bash | ||
pip install -U pip | ||
pip install -r requirements.txt | ||
pip install ../../../../ | ||
``` | ||
|
||
## Run Example | ||
|
||
To run the example: | ||
|
||
```bash | ||
python main.py | ||
``` | ||
|
||
It will automatically download the dataset and baseline model and save the resulting model. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
# Copyright (c) 2024 Intel Corporation | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
from functools import partial | ||
|
||
import datasets | ||
import numpy as np | ||
import openvino as ov | ||
from optimum.intel.openvino import OVModelForCausalLM | ||
from transformers import AutoTokenizer | ||
|
||
import nncf | ||
|
||
|
||
def main():
    """Quantize SmolLM-360M to FP8 with NNCF and generate text with the result.

    Steps performed:

    1. Load the WikiText-2 test split and drop (near-)empty samples.
    2. Export the Hugging Face model to OpenVINO through Optimum Intel.
    3. Calibrate and quantize the OpenVINO model with ``nncf.quantize`` in
       FP8 (E4M3) mode.
    4. Save the quantized model and the tokenizer to ``OUTPUT_DIR``.
    5. Reload the saved model and generate a completion for a fixed prompt.

    :return: The decoded text produced by the quantized model.
    """
    MODEL_ID = "HuggingFaceTB/SmolLM-360M"
    OUTPUT_DIR = "smollm_360m_compressed"

    dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    # Filtering to remove empty samples from the dataset
    dataset = dataset.filter(lambda example: len(example["text"]) > 1)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    # stateful=False: transform_fn below feeds the KV-cache tensors as explicit model inputs.
    model = OVModelForCausalLM.from_pretrained(
        MODEL_ID, export=True, load_in_8bit=False, compile=False, stateful=False
    )

    def transform_fn(data, model, tokenizer):
        """Turn one dataset sample into the input dictionary used for calibration.

        :param data: Dataset sample; its ``"text"`` field is tokenized.
        :param model: Model wrapper providing ``key_value_input_names`` and the
            underlying OpenVINO model as ``model.model``.
        :param tokenizer: Tokenizer, called with ``return_tensors="np"``.
        :return: Dictionary mapping model input names to their values.
        """
        tokenized_text = tokenizer(data["text"], return_tensors="np")
        input_ids = tokenized_text["input_ids"]
        attention_mask = tokenized_text["attention_mask"]

        inputs = {"input_ids": input_ids, "attention_mask": attention_mask}

        # Position of every token is the running count of attended tokens minus one;
        # masked-out positions are set to 1.
        position_ids = np.cumsum(attention_mask, axis=1) - 1
        position_ids[attention_mask == 0] = 1

        # The magic forms KV cache as model inputs: every key/value input gets an
        # empty tensor whose batch dimension matches the tokenized batch.
        batch_size = input_ids.shape[0]
        for input_name in model.key_value_input_names:
            model_input = model.model.input(input_name)
            shape = model_input.get_partial_shape()
            shape[0] = batch_size
            # Zero out one axis (presumably the sequence-length axis, whose index
            # depends on the cache layout) so the tensor holds no past tokens.
            if shape[2].is_dynamic:
                shape[2] = 0
            else:
                shape[1] = 0
            inputs[input_name] = ov.Tensor(model_input.get_element_type(), shape.get_shape())

        inputs["position_ids"] = position_ids
        return inputs

    quantization_dataset = nncf.Dataset(dataset, partial(transform_fn, model=model, tokenizer=tokenizer))

    model.model = nncf.quantize(
        model.model,
        calibration_dataset=quantization_dataset,
        # Only the PERFORMANCE preset is supported in combination with FP8 quantization mode
        preset=nncf.QuantizationPreset.PERFORMANCE,
        mode=nncf.QuantizationMode.FP8_E4M3,
        model_type=nncf.ModelType.TRANSFORMER,
        # SmoothQuant algorithm is not needed for FP8 quantization;
        # a negative alpha switches it off for MatMul operations.
        advanced_parameters=nncf.AdvancedQuantizationParameters(
            smooth_quant_alphas=nncf.AdvancedSmoothQuantParameters(matmul=-1)
        ),
    )
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    # Reload the saved model to check that the stored artifacts are usable for generation.
    # NOTE(review): group size "0" presumably disables OpenVINO runtime dynamic quantization - confirm.
    model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"})
    # The tokenizer returns the whole encoding (not only input ids), which is unpacked into generate().
    prompt_inputs = tokenizer("What is Python?", return_tensors="pt").to(device=model.device)

    output = model.generate(**prompt_inputs, max_new_tokens=100)
    output_text = tokenizer.decode(output[0])
    print(f"Optimized model output: {output_text}\n")
    return output_text
|
||
|
||
# Script entry point: run the example only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions
4
examples/llm_compression/openvino/smollm_360m_fp8/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
datasets | ||
openvino==2024.4 | ||
optimum-intel[openvino]>=1.13.0 | ||
transformers |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters