diff --git a/examples/llm_compression/openvino/smollm_360m_fp8/README.md b/examples/llm_compression/openvino/smollm_360m_fp8/README.md
new file mode 100644
index 00000000000..158be1b0480
--- /dev/null
+++ b/examples/llm_compression/openvino/smollm_360m_fp8/README.md
@@ -0,0 +1,26 @@
+# Large Language Models FP8 Compression Example
+
+This example demonstrates how to optimize Large Language Models (LLMs) with the NNCF quantization API. It applies FP8 quantization to the [HuggingFaceTB/SmolLM-360M](https://huggingface.co/HuggingFaceTB/SmolLM-360M) model, which significantly reduces the model footprint and improves inference performance with OpenVINO.
+
+## Prerequisites
+
+To use this example:
+
+- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate`
+- Install dependencies:
+
+```bash
+pip install -U pip
+pip install -r requirements.txt
+pip install ../../../../
+```
+
+## Run Example
+
+To run the example:
+
+```bash
+python main.py
+```
+
+The script automatically downloads the dataset and the baseline model, then saves the resulting quantized model.
diff --git a/examples/llm_compression/openvino/smollm_360m_fp8/main.py b/examples/llm_compression/openvino/smollm_360m_fp8/main.py
new file mode 100644
index 00000000000..3a5c54fdf2b
--- /dev/null
+++ b/examples/llm_compression/openvino/smollm_360m_fp8/main.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2024 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
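+
+# This script:
+#   1. loads the wikitext-2 test split and filters out empty samples,
+#   2. exports HuggingFaceTB/SmolLM-360M to OpenVINO via optimum-intel,
+#   3. builds a calibration nncf.Dataset with the transform_fn defined below,
+#   4. applies FP8 (E4M3) post-training quantization via nncf.quantize(),
+#   5. saves the compressed model and runs a short generation sanity check.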
+from functools import partial
+
+import datasets
+import numpy as np
+import openvino as ov
+from optimum.intel.openvino import OVModelForCausalLM
+from transformers import AutoTokenizer
+
+import nncf
+
+
+def main():
+    MODEL_ID = "HuggingFaceTB/SmolLM-360M"
+    OUTPUT_DIR = "smollm_360m_compressed"
+
+    dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+    # Filter out empty samples from the dataset
+    dataset = dataset.filter(lambda example: len(example["text"]) > 1)
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    model = OVModelForCausalLM.from_pretrained(MODEL_ID, export=True, load_in_8bit=False, compile=False, stateful=False)
+
+    def transform_fn(data, model, tokenizer):
+        tokenized_text = tokenizer(data["text"], return_tensors="np")
+        input_ids = tokenized_text["input_ids"]
+        attention_mask = tokenized_text["attention_mask"]
+
+        inputs = {}
+        inputs["input_ids"] = input_ids
+        inputs["attention_mask"] = attention_mask
+        position_ids = np.cumsum(attention_mask, axis=1) - 1
+        position_ids[attention_mask == 0] = 1
+
+        # Form empty (zero-length) past key/value tensors for every KV cache input of the model
+        batch_size = input_ids.shape[0]
+        for input_name in model.key_value_input_names:
+            model_inputs = model.model.input(input_name)
+            shape = model_inputs.get_partial_shape()
+            shape[0] = batch_size
+            if shape[2].is_dynamic:
+                shape[2] = 0
+            else:
+                shape[1] = 0
+            inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape())
+
+        inputs["position_ids"] = position_ids
+        return inputs
+
+    quantization_dataset = nncf.Dataset(dataset, partial(transform_fn, model=model, tokenizer=tokenizer))
+
+    model.model = nncf.quantize(
+        model.model,
+        calibration_dataset=quantization_dataset,
+        # Only the PERFORMANCE preset is supported in combination with the FP8 quantization mode
+        preset=nncf.QuantizationPreset.PERFORMANCE,
+        mode=nncf.QuantizationMode.FP8_E4M3,
+        model_type=nncf.ModelType.TRANSFORMER,
+        # The SmoothQuant algorithm is not needed for FP8 quantization, so it is disabled here
+        advanced_parameters=nncf.AdvancedQuantizationParameters(
+            smooth_quant_alphas=nncf.AdvancedSmoothQuantParameters(matmul=-1)
+        ),
+    )
+    model.save_pretrained(OUTPUT_DIR)
+    tokenizer.save_pretrained(OUTPUT_DIR)
+
+    model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"})
+    input_ids = tokenizer("What is Python?", return_tensors="pt").to(device=model.device)
+
+    output = model.generate(**input_ids, max_new_tokens=100)
+    output_text = tokenizer.decode(output[0])
+    print(f"Optimized model output: {output_text}\n")
+    return output_text
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/llm_compression/openvino/smollm_360m_fp8/requirements.txt b/examples/llm_compression/openvino/smollm_360m_fp8/requirements.txt
new file mode 100644
index 00000000000..3072dbf7969
--- /dev/null
+++ b/examples/llm_compression/openvino/smollm_360m_fp8/requirements.txt
@@ -0,0 +1,4 @@
+datasets
+openvino==2024.4
+optimum-intel[openvino]>=1.13.0
+transformers
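A note on the calibration transform in `main.py` above: `position_ids` are derived from the cumulative attention mask, and padded positions are parked at an arbitrary valid index so the model never sees a negative position. A minimal standalone sketch of just that step (the mask values below are an illustrative assumption, not taken from the example):

```python
import numpy as np

# One left-padded sample: 0 marks padding, 1 marks real tokens (illustrative values)
attention_mask = np.array([[0, 0, 1, 1, 1]])

position_ids = np.cumsum(attention_mask, axis=1) - 1  # -> [[-1, -1, 0, 1, 2]]
position_ids[attention_mask == 0] = 1  # padded slots get a harmless valid index
print(position_ids)  # [[1 1 0 1 2]]
```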
diff --git a/tests/cross_fw/examples/example_scope.json b/tests/cross_fw/examples/example_scope.json
index 5992dc8c57e..1ef7d1972a5 100644
--- a/tests/cross_fw/examples/example_scope.json
+++ b/tests/cross_fw/examples/example_scope.json
@@ -260,5 +260,13 @@
             "int8_model_size": 5.677968978881836,
             "model_compression_rate": 3.7654144877995197
         }
+    },
+    "fp8_llm_quantization": {
+        "backend": "openvino",
+        "requirements": "examples/llm_compression/openvino/smollm_360m_fp8/requirements.txt",
+        "cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz",
+        "accuracy_metrics": {
+            "word_count": 75
+        }
     }
 }
diff --git a/tests/cross_fw/examples/run_example.py b/tests/cross_fw/examples/run_example.py
index 5039b410d28..3ab0b861cab 100644
--- a/tests/cross_fw/examples/run_example.py
+++ b/tests/cross_fw/examples/run_example.py
@@ -184,6 +184,14 @@ def llm_compression_synthetic() -> Dict[str, float]:
     return {"word_count": len(result.split())}
 
 
+def fp8_llm_quantization() -> Dict[str, float]:
+    from examples.llm_compression.openvino.smollm_360m_fp8.main import main as fp8_llm_quantization_main
+
+    result = fp8_llm_quantization_main()
+
+    return {"word_count": len(result.split())}
+
+
 def post_training_quantization_torch_fx_resnet18():
     from examples.post_training_quantization.torch_fx.resnet18.main import main as resnet18_main
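For context, the new `fp8_llm_quantization` hook is what the cross-framework example test invokes; the returned `word_count` is compared against the `accuracy_metrics` reference in `example_scope.json`. A minimal sketch of running the same check by hand (assuming the NNCF repository root is on `PYTHONPATH` and the example's requirements are installed):

```python
# Mirrors the fp8_llm_quantization() hook added in run_example.py above.
from examples.llm_compression.openvino.smollm_360m_fp8.main import main

output_text = main()  # runs FP8 quantization end-to-end, returns the generated text
metrics = {"word_count": len(output_text.split())}
print(metrics)  # checked against "accuracy_metrics" in example_scope.json
```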