Added AutoGPTQ readme and test #1147

Draft · wants to merge 3 commits into `main`
31 changes: 31 additions & 0 deletions examples/text-generation/README.md
@@ -494,6 +494,37 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \

For more details see [documentation](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html#using-fused-sdpa).

### Running with UINT4 weight quantization using AutoGPTQ


Llama2-7b with UINT4 weight-only quantization is enabled using the [AutoGPTQ Fork](https://github.com/HabanaAI/AutoGPTQ), which provides quantization capabilities in PyTorch.
Currently, only UINT4 inference of pre-quantized models is supported.

**Collaborator:** What does "pre-quantized model" mean? Through which process is a model pre-quantized, and at which precision?

**Contributor (Author):** It means that we currently don't support the quantization process itself; we only support loading an existing quantized model.
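
For additional context: "pre-quantized" means the checkpoint was already quantized offline (typically with upstream AutoGPTQ, e.g. on a CUDA machine) and published with 4-bit weights; `TheBloke/Llama-2-7b-Chat-GPTQ` is such a checkpoint. The sketch below only illustrates that offline step using the upstream AutoGPTQ API; it is not part of this PR, it does not run on Gaudi, and the model name and settings are placeholders.

```python
# Illustration only: how a checkpoint is typically pre-quantized with upstream
# AutoGPTQ, offline and outside of this PR. The HabanaAI fork's API may differ.
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-7b-chat-hf"   # placeholder source model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# GPTQ settings for 4-bit (UINT4) weight-only quantization
quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)

# A (deliberately tiny) calibration set; real quantization uses many more samples
examples = [tokenizer("AutoGPTQ quantizes model weights to 4 bits.", return_tensors="pt")]

model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)
model.quantize(examples)                     # run the GPTQ calibration pass
model.save_quantized("llama2-7b-chat-gptq")  # write the pre-quantized checkpoint
```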

You can run a *UINT4 weight-quantized* model using AutoGPTQ by setting the following environment variables before running the command:
`SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=true`,
and by adding the `--gptq` argument.

***Note:***
Setting the above environment variables improves performance. These variables will be removed in future releases.


Here is an example of how to run the quantized Llama2-7b model `TheBloke/Llama-2-7b-Chat-GPTQ`:
```bash
SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false \
ENABLE_EXPERIMENTAL_FLAGS=true python run_generation.py \
--attn_softmax_bf16 \
--model_name_or_path TheBloke/Llama-2-7b-Chat-GPTQ \
--use_hpu_graphs \
--limit_hpu_graphs \
--use_kv_cache \
--bucket_size 128 \
--bucket_internal \
--trim_logits \
--max_new_tokens 128 \
--batch_size 1 \
--bf16 \
--gptq
```
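
For completeness, here is a minimal sketch of loading the same pre-quantized checkpoint from Python instead of through `run_generation.py`. It assumes the [AutoGPTQ Fork](https://github.com/HabanaAI/AutoGPTQ) keeps upstream AutoGPTQ's `from_quantized` loading API and that the environment variables above are set; the `run_generation.py --gptq` command remains the supported path.

```python
# Minimal sketch: loading a pre-quantized GPTQ checkpoint programmatically.
# Assumes the HabanaAI/AutoGPTQ fork keeps upstream AutoGPTQ's loading API;
# run_generation.py --gptq (above) is the flow actually exercised in this PR.
import os

# Same experimental flags as in the command-line example above
os.environ["SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED"] = "false"
os.environ["ENABLE_EXPERIMENTAL_FLAGS"] = "true"

from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer

model_id = "TheBloke/Llama-2-7b-Chat-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Device placement (CPU/GPU/HPU) depends on your setup and on the fork's defaults
model = AutoGPTQForCausalLM.from_quantized(model_id, use_safetensors=True)

inputs = tokenizer("Quantization lets large models", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```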

## Language Model Evaluation Harness

36 changes: 36 additions & 0 deletions tests/test_text_generation_example.py
@@ -59,6 +59,9 @@
("mistralai/Mixtral-8x7B-v0.1", 1, 1, True, 128, 128, 39.26845661768185),
("microsoft/phi-2", 1, 1, True, 128, 128, 254.08932787178165),
],
"gptq": [
("TheBloke/Llama-2-7b-Chat-GPTQ", 1, 10, False, 128, 2048, 456.7),
],
"deepspeed": [
("bigscience/bloomz", 36.77314954096159),
("meta-llama/Llama-2-70b-hf", 64.10514998902435),
@@ -96,6 +99,7 @@
("state-spaces/mamba-130m-hf", 224, False, 794.542),
],
"fp8": [],
"gptq": [],
"deepspeed": [
("bigscience/bloomz-7b1", 31.994268212011505),
],
@@ -114,6 +118,7 @@ def _test_text_generation(
world_size: int = 8,
torch_compile: bool = False,
fp8: bool = False,
gptq: bool = False,
max_input_tokens: int = 0,
max_output_tokens: int = 100,
):
@@ -188,6 +193,8 @@
f"--max_input_tokens {max_input_tokens}",
"--limit_hpu_graphs",
]
if gptq:
command += ["--gptq"]

with TemporaryDirectory() as tmp_dir:
command.append(f"--output_dir {tmp_dir}")
@@ -272,6 +279,35 @@ def test_text_generation_fp8(
)


@pytest.mark.parametrize(
"model_name, world_size, batch_size, reuse_cache, input_len, output_len, baseline", MODELS_TO_TEST["gptq"]
)
def test_text_generation_gptq(
model_name: str,
baseline: float,
world_size: int,
batch_size: int,
reuse_cache: bool,
input_len: int,
output_len: int,
token: str,
):
deepspeed = world_size > 1
_test_text_generation(
model_name,
baseline,
token,
deepspeed=deepspeed,
world_size=world_size,
fp8=False,
gptq=True,
batch_size=batch_size,
reuse_cache=reuse_cache,
max_input_tokens=input_len,
max_output_tokens=output_len,
)


@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["deepspeed"])
def test_text_generation_deepspeed(model_name: str, baseline: float, token: str):
world_size = 2 if "opt-66b" in model_name else 8
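
To run just the new GPTQ test locally, the usual pytest keyword filter should work, e.g. `python -m pytest tests/test_text_generation_example.py -v -s -k gptq` (assuming the repository's standard test invocation and a machine with the required Gaudi setup).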