
Added FP8 example
KodiaqQ committed Nov 5, 2024
1 parent 34cb441 commit e59e941
Showing 5 changed files with 132 additions and 0 deletions.
26 changes: 26 additions & 0 deletions examples/llm_compression/openvino/smollm_360m_fp8/README.md
@@ -0,0 +1,26 @@
# Large Language Models FP8 Compression Example

This example demonstrates how to optimize Large Language Models (LLMs) using the NNCF quantization API. It applies FP8 (E4M3) quantization to the [HuggingFaceTB/SmolLM-360M](https://huggingface.co/HuggingFaceTB/SmolLM-360M) model, which significantly reduces the model footprint and improves inference performance with OpenVINO.
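
The core of the example is a single `nncf.quantize` call with `mode=nncf.QuantizationMode.FP8_E4M3`. As a self-contained illustration of that call (a toy single-matmul model with random calibration data, not the SmolLM pipeline from `main.py`), it looks like this:

```python
import numpy as np
import openvino as ov
from openvino.runtime import opset13 as ops

import nncf

# Toy single-matmul model standing in for a real network
x = ops.parameter([1, 8], np.float32, name="x")
w = ops.constant(np.random.rand(8, 8).astype(np.float32))
toy_model = ov.Model([ops.matmul(x, w, False, False)], [x], "toy")

# A handful of random samples is enough to calibrate the toy model
calibration = nncf.Dataset([np.random.rand(1, 8).astype(np.float32) for _ in range(10)])
quantized = nncf.quantize(toy_model, calibration, mode=nncf.QuantizationMode.FP8_E4M3)
```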

## Prerequisites

To use this example:

- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate`
- Install dependencies:

```bash
pip install -U pip
pip install -r requirements.txt
pip install ../../../../
```

## Run Example

To run the example:

```bash
python main.py
```

The script automatically downloads the dataset and the baseline model, then saves the quantized model.
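
Once the run completes, the compressed model can be loaded back with Optimum Intel like any other OpenVINO model. A minimal usage sketch, assuming the `smollm_360m_compressed` output directory produced by `main.py`:

```python
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer

# Group size 0 disables runtime dynamic quantization so the FP8 weights are used as-is
model = OVModelForCausalLM.from_pretrained(
    "smollm_360m_compressed", ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"}
)
tokenizer = AutoTokenizer.from_pretrained("smollm_360m_compressed")

inputs = tokenizer("What is Python?", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0]))
```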
86 changes: 86 additions & 0 deletions examples/llm_compression/openvino/smollm_360m_fp8/main.py
@@ -0,0 +1,86 @@
# Copyright (c) 2024 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial

import datasets
import numpy as np
import openvino as ov
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer

import nncf


def main():
    MODEL_ID = "HuggingFaceTB/SmolLM-360M"
    OUTPUT_DIR = "smollm_360m_compressed"

    dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    # Filter out empty samples from the dataset
    dataset = dataset.filter(lambda example: len(example["text"]) > 1)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = OVModelForCausalLM.from_pretrained(MODEL_ID, export=True, load_in_8bit=False, compile=False, stateful=False)

    def transform_fn(data, model, tokenizer):
        tokenized_text = tokenizer(data["text"], return_tensors="np")
        input_ids = tokenized_text["input_ids"]
        attention_mask = tokenized_text["attention_mask"]

        inputs = {}
        inputs["input_ids"] = input_ids
        inputs["attention_mask"] = attention_mask
        position_ids = np.cumsum(attention_mask, axis=1) - 1
        position_ids[attention_mask == 0] = 1

        # The stateless model expects KV-cache tensors as explicit inputs, so create empty ones of the right shape and type
        batch_size = input_ids.shape[0]
        for input_name in model.key_value_input_names:
            model_inputs = model.model.input(input_name)
            shape = model_inputs.get_partial_shape()
            shape[0] = batch_size
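            # Empty cache: set the sequence-length dimension to 0; which axis holds it depends on the exported layout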
            if shape[2].is_dynamic:
                shape[2] = 0
            else:
                shape[1] = 0
            inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape())

inputs["position_ids"] = position_ids
return inputs

    quantization_dataset = nncf.Dataset(dataset, partial(transform_fn, model=model, tokenizer=tokenizer))

    model.model = nncf.quantize(
        model.model,
        calibration_dataset=quantization_dataset,
        # Only the PERFORMANCE preset is supported in combination with the FP8 quantization mode
        preset=nncf.QuantizationPreset.PERFORMANCE,
        mode=nncf.QuantizationMode.FP8_E4M3,
        model_type=nncf.ModelType.TRANSFORMER,
        # The SmoothQuant algorithm is not needed for FP8 quantization, so alpha=-1 disables it
        advanced_parameters=nncf.AdvancedQuantizationParameters(
            smooth_quant_alphas=nncf.AdvancedSmoothQuantParameters(matmul=-1)
        ),
    )
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

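    # DYNAMIC_QUANTIZATION_GROUP_SIZE=0 disables runtime dynamic quantization so the FP8 weights are executed as-is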
    model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"})
    input_ids = tokenizer("What is Python?", return_tensors="pt").to(device=model.device)

    output = model.generate(**input_ids, max_new_tokens=100)
    output_text = tokenizer.decode(output[0])
    print(f"Optimized model output: {output_text}\n")
    return output_text


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions examples/llm_compression/openvino/smollm_360m_fp8/requirements.txt
@@ -0,0 +1,4 @@
datasets
openvino==2024.4
optimum-intel[openvino]>=1.13.0
transformers
8 changes: 8 additions & 0 deletions tests/cross_fw/examples/example_scope.json
@@ -260,5 +260,13 @@
            "int8_model_size": 5.677968978881836,
            "model_compression_rate": 3.7654144877995197
        }
    },
    "fp8_llm_quantization": {
        "backend": "openvino",
        "requirements": "examples/llm_compression/openvino/smollm_360m_fp8/requirements.txt",
        "cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz",
        "accuracy_metrics": {
            "word_count": 75
        }
    }
}
8 changes: 8 additions & 0 deletions tests/cross_fw/examples/run_example.py
@@ -184,6 +184,14 @@ def llm_compression_synthetic() -> Dict[str, float]:
    return {"word_count": len(result.split())}


def fp8_llm_quantization() -> Dict[str, float]:
    from examples.llm_compression.openvino.smollm_360m_fp8.main import main as fp8_llm_quantization_main

    result = fp8_llm_quantization_main()

    return {"word_count": len(result.split())}


def post_training_quantization_torch_fx_resnet18():
    from examples.post_training_quantization.torch_fx.resnet18.main import main as resnet18_main

