From 977361bdbebde2aca7043db2245d6fb312bd1156 Mon Sep 17 00:00:00 2001
From: charlifu
Date: Sat, 1 Jun 2024 00:47:27 +0000
Subject: [PATCH] doc fix

---
 ROCm_performance.md                                 | 6 +++---
 vllm/model_executor/layers/quantization/fp8_rocm.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ROCm_performance.md b/ROCm_performance.md
index f46741f3f19aa..0f12ed1adc9af 100644
--- a/ROCm_performance.md
+++ b/ROCm_performance.md
@@ -21,9 +21,9 @@ The custom PagedAttention kernel is enabled for dtype: fp16, block-size=16, head
 
 ## Fp8 Quantization
 
-To use fp8 quantization, first step is to use Nvidia ammo to quantize your model to fp8 format, following this [instruction](https://github.com/vllm-project/vllm/blob/main/examples/fp8/quantizer/README.md). This will give a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. We will need to put the safetensor file under your model folder, and add file called `serenity_config.json`, which contains a json object with a key: `"quantized_weights": "quantized/osf/rank0.safetensors"`, the value should be the releative path of your safetensor file containing the quantized weights.
+To use fp8 quantization, the first step is to quantize your model to fp8 format, generating a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be placed under your model folder along with a file called `serenity_config.json`, which contains a JSON object with the key `"quantized_weights": "quantized/osf/rank0.safetensors"`, where the value is the relative path of the safetensor file containing the quantized weights.
 
-Then we can run a model with fp8 quantization using vllm, just add a parameter `quantization="fp8"` when creating the vllm.LLM object.
+Then we can run a model with fp8 quantization using vLLM; just add the parameter `quantization="fp8"` when creating the `vllm.LLM` object.
 
 ## Gemm Tunning for Fp8
 
@@ -37,7 +37,7 @@ Next, run gradlib to obtain the best solutions of these shapes:
 cd gradlib_fp8
 python3 -m pip uninstall gradlib
 python3 setup.py install
-python3 gemm_tunner.py --input_file /fp8_shapes.csv --tuned_file /tuned_fp8_16.csv
+python3 gemm_tuner.py --input_file /fp8_shapes.csv --tuned_file /tuned_fp8_16.csv
 cd ../gradlib
 python3 -m pip uninstall gradlib
 python3 setup.py install
diff --git a/vllm/model_executor/layers/quantization/fp8_rocm.py b/vllm/model_executor/layers/quantization/fp8_rocm.py
index d39c074b97325..55642ae23ac6d 100644
--- a/vllm/model_executor/layers/quantization/fp8_rocm.py
+++ b/vllm/model_executor/layers/quantization/fp8_rocm.py
@@ -59,7 +59,7 @@ def get_min_capability(cls) -> int:
 
     @classmethod
     def get_name(cls) -> str:
-        return "serenity"
+        return "Fp8Rocm"
 
     def get_linear_method(self) -> "Fp8RocmLinearLayer":
         return Fp8RocmLinearLayer(self)
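
For reference, the `serenity_config.json` described in the first hunk amounts to a one-key JSON object. A minimal sketch of writing it from Python, assuming the `quantized/osf/rank0.safetensors` relative path used as the example in the patch (your actual path may differ):

```python
import json

# serenity_config.json lives in the model folder and points at the
# quantized-weights safetensor file via a path relative to that folder.
config = {"quantized_weights": "quantized/osf/rank0.safetensors"}

with open("serenity_config.json", "w") as f:
    json.dump(config, f, indent=2)
```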
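And a minimal sketch of running the quantized model, assuming a local model folder at `/path/to/model` (the `quantization="fp8"` parameter and the `vllm.LLM` entry point come from the patch; the model path and prompt are placeholders):

```python
from vllm import LLM

# Point vLLM at the model folder that contains serenity_config.json and
# the quantized safetensor file, and select the fp8 quantization method.
llm = LLM(model="/path/to/model", quantization="fp8")

# Generate a completion from the fp8-quantized model.
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```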