Commit: fix acc issue and refine code
Signed-off-by: Mengni Wang <[email protected]>
mengniwang95 committed Sep 13, 2024
1 parent 27eae66 commit 9083a19
Showing 6 changed files with 86 additions and 82 deletions.
27 changes: 27 additions & 0 deletions examples/.config/model_params_onnxrt.json
@@ -18,6 +18,15 @@
"batch_size": 1,
"algorithm": "RTN"
},
"llama-2-7b-rtn-with-past-qdq": {
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past-opset-21",
"main_script": "main.py",
"batch_size": 1,
"algorithm": "RTN"
},
"llama-2-7b-awq": {
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
@@ -36,6 +45,15 @@
"batch_size": 1,
"algorithm": "AWQ"
},
"llama-2-7b-awq-with-past-qdq": {
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past-opset-21",
"main_script": "main.py",
"batch_size": 1,
"algorithm": "AWQ"
},
"llama-2-7b-gptq": {
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
@@ -54,6 +72,15 @@
"batch_size": 1,
"algorithm": "GPTQ"
},
"llama-2-7b-gptq-with-past-qdq": {
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past-opset-21",
"main_script": "main.py",
"batch_size": 1,
"algorithm": "GPTQ"
},
"llama-2-7b-woq_tune": {
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
2 changes: 1 addition & 1 deletion examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md
@@ -55,7 +55,7 @@ bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model
--dataset=NeelNanda/pile-10k \
--tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
--algorithm=WOQ_TUNE # support WOQ_TUNE, RTN, AWQ, GPTQ \
--quant_format=QOperator # support QOperator and QDQ
--quant_format=QDQ # support QOperator and QDQ
```
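For a concrete run matching the new CI entries, the quantization step might be invoked like this (a sketch: the model path mirrors the with-past opset-21 export registered in model_params_onnxrt.json above, and only the flags documented in this README are used):

```bash
# Hedged example: RTN weight-only quantization emitting the new default QDQ format.
bash run_quant.sh --input_model=/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past-opset-21 \
                  --dataset=NeelNanda/pile-10k \
                  --tokenizer=meta-llama/Llama-2-7b-hf \
                  --algorithm=RTN \
                  --quant_format=QDQ
```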

## 2. Benchmark
2 changes: 1 addition & 1 deletion examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py
@@ -106,7 +106,7 @@
default=[],
help="nodes that will not be quantized. Doesn't take effect when 'algorithm' is 'WOQ_TUNE'",
)
parser.add_argument("--quant_format", type=str, default="QOperator", choices=["QOperator", "QDQ"])
parser.add_argument("--quant_format", type=str, default="QDQ", choices=["QOperator", "QDQ"])
args = parser.parse_args()

if args.tune and not os.path.exists(args.output_model):
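With the default flipped to QDQ, the parsed string still has to reach the quantization config as an enum value. A minimal sketch of that mapping (a hypothetical helper, not main.py's actual wiring; QuantFormat is the enum imported in algorithm_entry.py later in this commit):

```python
# Sketch only: map the CLI string onto the QuantFormat enum.
from onnx_neural_compressor.quantization import QuantFormat

def resolve_quant_format(name: str) -> QuantFormat:
    # Enum lookup by member name: "QDQ" -> QuantFormat.QDQ, "QOperator" -> QuantFormat.QOperator.
    return QuantFormat[name]
```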
2 changes: 1 addition & 1 deletion onnx_neural_compressor/algorithms/utility.py
@@ -340,7 +340,7 @@ def make_weight_only_dequant_node(
input_names = []
kwargs = {"block_size": block_size, "axis": axis}

q_weight = q_weight.reshape((-1, weight_shape[-1])).T
q_weight = q_weight.reshape((weight_shape[-1], -1)).T
if num_bits == 4:
q_weight = ((q_weight[:, ::2] & 0xF | q_weight[:, 1::2] << 4) & 0xFF).astype("uint8")

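The one-line fix above changes how the flattened quantized weight is folded back before 4-bit packing: the corrected reshape treats the flat codes as laid out along the last weight dimension first, so (weight_shape[-1], -1) must come before the transpose. A self-contained numpy sketch of the corrected fold plus the packing step on the following line (toy values; the layout is assumed from the fix itself):

```python
import numpy as np

# Toy dims: weight_shape = (K, N); q_flat holds the flattened 4-bit codes.
K, N = 4, 2
q_flat = np.arange(K * N, dtype=np.uint8)  # stand-in codes, values 0..7

# Corrected fold: reshape to (N, K) first, then transpose back to (K, N).
q = q_flat.reshape((N, -1)).T              # shape (K, N), matches weight_shape

# Packing, as in make_weight_only_dequant_node: two codes per byte,
# even columns in the low nibble, odd columns in the high nibble.
packed = ((q[:, ::2] & 0xF | q[:, 1::2] << 4) & 0xFF).astype(np.uint8)
print(packed.shape)                        # (K, N // 2) -> (4, 1)
```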
99 changes: 21 additions & 78 deletions onnx_neural_compressor/algorithms/weight_only/gptq.py
@@ -25,6 +25,7 @@
from packaging.version import Version

from onnx_neural_compressor import constants, data_reader, onnx_model, utility
from onnx_neural_compressor.algorithms.weight_only import rtn
from onnx_neural_compressor.algorithms import utility as quant_utils
from onnx_neural_compressor.algorithms.layer_wise import core
from onnx_neural_compressor.quantization import config
@@ -301,13 +302,13 @@ def gptq_quantize(
weight,
H,
) in zip(node_list, weights, Hs):
weight_dtype = weight_config[node.name].get("weight_dtype", "int")
num_bits = weight_config[node.name].get("weight_bits", 4)
group_size = weight_config[node.name].get("weight_group_size", 32)
sym = weight_config[node.name].get("weight_sym", True)
accuracy_level = weight_config[node.name].get("accuracy_level", 0)
group_size = group_size if group_size != -1 else weight.shape[0]
dtype = weight.dtype

weight_tensor = model.get_initializer(node.input[1])
init_share_num = model.get_initializer_share_num(node.input[1])

# weight -> quant -> dequant -> q_weight
q_weight = _gptq(
@@ -322,88 +323,30 @@
mse=mse,
perchannel=perchannel,
)

weight_tensor = model.get_initializer(node.input[1])
org_shape = weight.shape
init_share_num = model.get_initializer_share_num(node.input[1])

satisfy_MatMulNBits_condition = ort_version > constants.ONNXRT1161_VERSION and num_bits == 4
satisfy_MatMulFpQ4_condition = (
ort_version >= constants.ONNXRT116_VERSION and num_bits == 4 and group_size == 32
)
if (
quant_format == 1 # QDQ format
and num_bits in [4, 8]
and ort_version >= constants.ONNXRT119_VERSION
and model.opset_import[0].version > 20
):
_, _, zp, scale, q_weight = quant_utils.quantize_data(
weight.T.reshape((-1, group_size)),
weight_dtype + str(num_bits),
sym,
axis=1,
)
dequant_node, new_inits = quant_utils.make_weight_only_dequant_node(
node=node,
weight_shape=org_shape,
num_bits=num_bits,
dtype=weight_dtype,
q_weight=q_weight,
scale=scale.astype(weight.dtype),
axis=0,
block_size=group_size,
zero_point=zp,
)
model.add_initializers(new_inits)
model.add_node(dequant_node)
node.name += "_Q"
elif ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or (
"CUDAExecutionProvider" not in providers
and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition)
):
# MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP
# MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP
k_blocks = (org_shape[0] + group_size - 1) // group_size
q_weight = quant_utils.pad_tensor(q_weight, group_size, k_blocks)
_, _, zp, scale, q_weight = quant_utils.quantize_data(
q_weight.T.reshape((-1, group_size)),
weight_dtype + str(num_bits),
sym,
axis=1,
)
q_matmul_node, new_inits = quant_utils.make_matmul_weight_only_node(
node=node,
weight_shape=org_shape,
num_bits=num_bits,
group_size=group_size,
k_blocks=k_blocks,
q_weight=q_weight,
scale=scale.astype(dtype),
zero_point=zp if not sym else None,
accuracy_level=accuracy_level,
)

model.add_initializers(new_inits)
model.remove_node(node)
model.add_node(q_matmul_node)
if init_share_num == 1:
model.set_initializer(weight_tensor, q_weight)
else:
q_weight_tensor = onnx.helper.make_tensor(
name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)),
data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype),
dims=q_weight.shape,
vals=q_weight.astype(dtype).tobytes(),
new_init = onnx.helper.make_tensor(
name=node.input[1] + "_GPTQ",
data_type=weight_tensor.data_type,
dims=weight_tensor.dims,
vals=array.flatten().tolist().tobytes(),
raw=True,
)
Check failure (Code scanning / lintrunner): RUFF/F821 Error.
model.add_initializer(q_weight_tensor)
node.input[1] = q_weight_tensor.name
if init_share_num == 1:
model.remove_initializer(weight_tensor)
node.input[0] = new_init.name
model.add_initializer(new_init)

model.model = rtn.rtn_quantize(
model=model,
weight_config=weight_config,
ratios=full_ratio,
providers=providers,
quant_format=quant_format,
)
Check failure (Code scanning / lintrunner): RUFF/F821 Error: Undefined name full_ratio. See https://docs.astral.sh/ruff/rules/undefined-name

model.remove_tensors_from_outputs(output_names)
model.model.graph.output.MergeFrom(org_output)

model.topological_sort()

# reload external data to prevent external data file path errors
if model.is_large_model:

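The shape of the refactor: the per-node QDQ/MatMulNBits/MatMulFpQ4 branching is deleted, gptq_quantize now only writes the GPTQ-processed (dequantized) weights back into the graph as fresh initializers, and the actual weight-only conversion is delegated to rtn.rtn_quantize. A corrected sketch of that tail, assuming the two names flagged by lintrunner were meant to be the dequantized weight array and a ratios mapping computed earlier, and that the weight input is input[1] rather than the input[0] shown above:

```python
# Sketch of the intended logic, not the committed code: `array` is assumed to be
# the dequantized q_weight, `full_ratio` a per-node ratio mapping built upstream.
import onnx
from onnx_neural_compressor.algorithms.weight_only import rtn

def write_back_and_requantize(model, node_list, dequant_weights, weight_config,
                              ratios, providers, quant_format):
    """Replace each MatMul weight with its GPTQ-dequantized copy, then let RTN quantize."""
    for node, q_weight in zip(node_list, dequant_weights):
        weight_tensor = model.get_initializer(node.input[1])
        new_init = onnx.helper.make_tensor(
            name=node.input[1] + "_GPTQ",
            data_type=weight_tensor.data_type,
            dims=weight_tensor.dims,
            vals=q_weight.flatten().tobytes(),  # raw bytes, since raw=True
            raw=True,
        )
        node.input[1] = new_init.name           # rebind the weight input
        model.add_initializer(new_init)

    model.model = rtn.rtn_quantize(
        model=model,
        weight_config=weight_config,
        ratios=ratios,
        providers=providers,
        quant_format=quant_format,
    )
    return model
```

Note that beyond the undefined `array`, the committed `.tolist().tobytes()` chain cannot run as written (a Python list has no tobytes method); `.flatten().tobytes()` is the usual numpy idiom.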
36 changes: 35 additions & 1 deletion onnx_neural_compressor/quantization/algorithm_entry.py
@@ -23,7 +23,34 @@
from onnx_neural_compressor.algorithms.post_training_quant import calibrate, quantizer
from onnx_neural_compressor.algorithms.smoother import core
from onnx_neural_compressor.algorithms.weight_only import awq, gptq, rtn
from onnx_neural_compressor.quantization import config
from onnx_neural_compressor.quantization import QuantFormat, config

ort_version = version.Version(ort.__version__)

Check failure (Code scanning / lintrunner): RUFF/F821 Error.

def _update_quant_format(algorithm, model, quant_config):
if isinstance(model, str):
model = onnx.load(model, load_external_data=False)
quant_format = getattr(quant_config.quant_format, "value", quant_format)

Check failure (Code scanning / lintrunner): RUFF/F821 Error: Undefined name quant_format. See https://docs.astral.sh/ruff/rules/undefined-name
if algorithm in [constants.RTN, constants.AWQ, constants.GPTQ]:
if quant_config.weight_bits not in [4, 8] and quant_format == 1:
logger.warning(
"QDQ format only support 4 and 8 bits now, but get {} bits."
"Use QOperator format instead.".format(quant_config.weight_bits)
)
quant_config.quant_format = QuantFormat.QOperator
elif (
quant_config.weight_bits == 4
and (ort_version < constants.ONNXRT119_VERSION or model.opset_import[0].version < 21)
and quant_format == 1
):
logger.warning(
"QDQ format for 4 bits tensor requires onnxruntime >= 1.19.0 and the opset version of model > 20, "
"but get onnxruntime version is {}, opset version is {}. Use QOperator format instead.".format(
ort_version, model.opset_import[0].version
)
)
quant_config.quant_format = QuantFormat.QOperator
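As the annotation above flags, quant_format is read before it is ever assigned. A one-line corrected sketch of the presumed intent (QuantFormat.QDQ has integer value 1, which is what the == 1 comparisons in this function expect):

```python
def _resolve_quant_format_value(quant_config):
    # Presumed intent of the flagged line: return the enum's integer value,
    # falling back to the attribute itself if it is already a plain int.
    return getattr(quant_config.quant_format, "value", quant_config.quant_format)
```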


###################### RTN Algo Entry ##################################
@@ -32,13 +59,16 @@ def rtn_quantize_entry(
model: Union[pathlib.Path, str], quant_config: config.RTNConfig, *args, **kwargs
) -> onnx.ModelProto:
"""The main entry to apply rtn quantization."""
_update_quant_format(constants.RTN, model, quant_config)

if len(quant_config.config_mapping) == 0:
# map config to each op
model_info = config.RTNConfig.get_model_info(model=model)
config_mapping = quant_config.to_config_mapping(model_info=model_info)
logger.debug(config_mapping)
else:
config_mapping = quant_config.config_mapping

quant_kwargs = {}
for key in config.RTNConfig.model_params_list:
val = getattr(quant_config, key)
@@ -62,6 +92,8 @@ def gptq_quantize_entry(
calibration_data_reader, data_reader.CalibrationDataReader
), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader"

_update_quant_format(constants.GPTQ, model, quant_config)

if len(quant_config.config_mapping) == 0:
# map config to each op
model_info = config.GPTQConfig.get_model_info(model=model)
@@ -96,6 +128,8 @@ def awq_quantize_entry(
calibration_data_reader, data_reader.CalibrationDataReader
), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader"

_update_quant_format(constants.AWQ, model, quant_config)

if len(quant_config.config_mapping) == 0:
# map config to each op
model_info = config.AWQConfig.get_model_info(model=model)
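Taken together, a caller can now request QDQ and rely on these entry hooks to downgrade gracefully. A hypothetical usage sketch (RTNConfig's exact constructor arguments are assumed, not confirmed by this diff):

```python
# Hypothetical sketch: request QDQ; _update_quant_format falls back to QOperator
# when weight_bits is not 4/8, onnxruntime < 1.19, or the model opset is <= 20.
from onnx_neural_compressor.quantization import QuantFormat, algorithm_entry, config

rtn_config = config.RTNConfig(weight_bits=4, quant_format=QuantFormat.QDQ)  # assumed kwargs
qmodel = algorithm_entry.rtn_quantize_entry("/path/to/model.onnx", rtn_config)
```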
