Commit 9083a19
Signed-off-by: Mengni Wang <[email protected]>
1 parent 27eae66
Showing 6 changed files with 86 additions and 82 deletions.
@@ -25,6 +25,7 @@
from packaging.version import Version

from onnx_neural_compressor import constants, data_reader, onnx_model, utility
from onnx_neural_compressor.algorithms.weight_only import rtn
from onnx_neural_compressor.algorithms import utility as quant_utils
from onnx_neural_compressor.algorithms.layer_wise import core
from onnx_neural_compressor.quantization import config
@@ -301,13 +302,13 @@ def gptq_quantize(
weight,
H,
) in zip(node_list, weights, Hs):
weight_dtype = weight_config[node.name].get("weight_dtype", "int")
num_bits = weight_config[node.name].get("weight_bits", 4)
group_size = weight_config[node.name].get("weight_group_size", 32)
sym = weight_config[node.name].get("weight_sym", True)
accuracy_level = weight_config[node.name].get("accuracy_level", 0)
group_size = group_size if group_size != -1 else weight.shape[0]
dtype = weight.dtype

weight_tensor = model.get_initializer(node.input[1])
init_share_num = model.get_initializer_share_num(node.input[1])

# weight -> quant -> dequant -> q_weight
q_weight = _gptq(
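For context on the `weight_config` lookups in the hunk above: each MatMul node name maps to a per-op settings dict, and a `weight_group_size` of -1 is expanded to the full input dimension (one group per column). A minimal sketch of such an entry follows; only the keys read by `gptq_quantize` are shown, and the node name and values are made up for illustration.

```python
# Hypothetical per-node entry read by gptq_quantize (key names taken from the
# diff above; node name and values are illustrative only).
weight_config = {
    "/model/layers.0/mlp/down_proj/MatMul": {
        "weight_dtype": "int",    # combined with bits into e.g. "int4" for quantize_data
        "weight_bits": 4,         # weight-only bit width (4 or 8)
        "weight_group_size": 32,  # elements sharing one scale/zero-point; -1 = whole column
        "weight_sym": True,       # symmetric quantization (no zero point kept downstream)
        "accuracy_level": 0,      # forwarded to the MatMulNBits accuracy_level attribute
    }
}
```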
@@ -322,88 +323,30 @@ def gptq_quantize(
mse=mse,
perchannel=perchannel,
)

weight_tensor = model.get_initializer(node.input[1])
org_shape = weight.shape
init_share_num = model.get_initializer_share_num(node.input[1])

satisfy_MatMulNBits_condition = ort_version > constants.ONNXRT1161_VERSION and num_bits == 4
satisfy_MatMulFpQ4_condition = (
ort_version >= constants.ONNXRT116_VERSION and num_bits == 4 and group_size == 32
)
if (
quant_format == 1  # QDQ format
and num_bits in [4, 8]
and ort_version >= constants.ONNXRT119_VERSION
and model.opset_import[0].version > 20
):
_, _, zp, scale, q_weight = quant_utils.quantize_data(
weight.T.reshape((-1, group_size)),
weight_dtype + str(num_bits),
sym,
axis=1,
)
dequant_node, new_inits = quant_utils.make_weight_only_dequant_node(
node=node,
weight_shape=org_shape,
num_bits=num_bits,
dtype=weight_dtype,
q_weight=q_weight,
scale=scale.astype(weight.dtype),
axis=0,
block_size=group_size,
zero_point=zp,
)
model.add_initializers(new_inits)
model.add_node(dequant_node)
node.name += "_Q"
elif ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or (
"CUDAExecutionProvider" not in providers
and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition)
):
# MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP
# MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP
k_blocks = (org_shape[0] + group_size - 1) // group_size
q_weight = quant_utils.pad_tensor(q_weight, group_size, k_blocks)
_, _, zp, scale, q_weight = quant_utils.quantize_data(
q_weight.T.reshape((-1, group_size)),
weight_dtype + str(num_bits),
sym,
axis=1,
)
q_matmul_node, new_inits = quant_utils.make_matmul_weight_only_node(
node=node,
weight_shape=org_shape,
num_bits=num_bits,
group_size=group_size,
k_blocks=k_blocks,
q_weight=q_weight,
scale=scale.astype(dtype),
zero_point=zp if not sym else None,
accuracy_level=accuracy_level,
)

model.add_initializers(new_inits)
model.remove_node(node)
model.add_node(q_matmul_node)
if init_share_num == 1:
model.set_initializer(weight_tensor, q_weight)
else:
q_weight_tensor = onnx.helper.make_tensor(
name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)),
data_type=onnx.helper.np_dtype_to_tensor_dtype(dtype),
dims=q_weight.shape,
vals=q_weight.astype(dtype).tobytes(),
new_init = onnx.helper.make_tensor(
name=node.input[1] + "_GPTQ",
data_type=weight_tensor.data_type,
dims=weight_tensor.dims,
vals=array.flatten().tolist().tobytes(),
[Check failure: Code scanning / lintrunner - RUFF/F821] Undefined name `array`. See https://docs.astral.sh/ruff/rules/undefined-name
raw=True,
)
model.add_initializer(q_weight_tensor)
node.input[1] = q_weight_tensor.name
if init_share_num == 1:
model.remove_initializer(weight_tensor)
node.input[0] = new_init.name
model.add_initializer(new_init)

model.model = rtn.rtn_quantize(
model=model,
weight_config=weight_config,
ratios=full_ratio,
[Check failure: Code scanning / lintrunner - RUFF/F821] Undefined name `full_ratio`. See https://docs.astral.sh/ruff/rules/undefined-name
providers=providers,
quant_format=quant_format,
)

model.remove_tensors_from_outputs(output_names)
model.model.graph.output.MergeFrom(org_output)

model.topological_sort()

# reload external data to prevent external data file path errors
if model.is_large_model:
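The hunk above decides, per MatMul weight, which representation to emit: a QDQ `DequantizeLinear` path when the runtime and opset are new enough, otherwise a fused `MatMulNBits`/`MatMulFpQ4` operator. As a reading aid, here is a small self-contained sketch of that dispatch; the version thresholds come from the diff and its comments, while the helper name, argument names, and return strings are made up and are not the project's API.

```python
from packaging.version import Version

# Hypothetical paraphrase of the format selection in gptq_quantize above.
def pick_weight_format(ort_version: str, opset: int, num_bits: int,
                       group_size: int, quant_format_is_qdq: bool,
                       cuda: bool) -> str:
    v = Version(ort_version)
    # QDQ: 4/8-bit block-wise DequantizeLinear needs onnxruntime >= 1.19 and opset > 20.
    if quant_format_is_qdq and num_bits in (4, 8) and v >= Version("1.19.0") and opset > 20:
        return "QDQ (DequantizeLinear + MatMul)"
    # MatMulNBits: 4 bits with onnxruntime > 1.16.1, supported on CPU and CUDA EPs.
    if num_bits == 4 and v > Version("1.16.1"):
        return "MatMulNBits"
    # MatMulFpQ4: 4 bits and group_size 32 with onnxruntime 1.16.0/1.16.1, CPU EP only.
    if num_bits == 4 and group_size == 32 and v >= Version("1.16.0") and not cuda:
        return "MatMulFpQ4"
    return "keep MatMul and store the quant-dequant weight in the existing initializer"

# Example: pick_weight_format("1.19.0", 21, 4, 32, True, cuda=False)
# -> "QDQ (DequantizeLinear + MatMul)"
```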
@@ -23,7 +23,34 @@
from onnx_neural_compressor.algorithms.post_training_quant import calibrate, quantizer
from onnx_neural_compressor.algorithms.smoother import core
from onnx_neural_compressor.algorithms.weight_only import awq, gptq, rtn
from onnx_neural_compressor.quantization import config
from onnx_neural_compressor.quantization import QuantFormat, config

ort_version = version.Version(ort.__version__)
[Check failure: Code scanning / lintrunner - RUFF/F821] Undefined name `version`. See https://docs.astral.sh/ruff/rules/undefined-name


def _update_quant_format(algorithm, model, quant_config):
if isinstance(model, str):
model = onnx.load(model, load_external_data=False)
quant_format = getattr(quant_config.quant_format, "value", quant_format)
[Check failure: Code scanning / lintrunner - RUFF/F821] Undefined name `quant_format`. See https://docs.astral.sh/ruff/rules/undefined-name
if algorithm in [constants.RTN, constants.AWQ, constants.GPTQ]:
if quant_config.weight_bits not in [4, 8] and quant_format == 1:
logger.warning(
"QDQ format only support 4 and 8 bits now, but get {} bits."
"Use QOperator format instead.".format(quant_config.weight_bits)
)
quant_config.quant_format = QuantFormat.QOperator
elif (
quant_config.weight_bits == 4
and (ort_version < constants.ONNXRT119_VERSION or model.opset_import[0].version < 21)
and quant_format == 1
):
logger.warning(
"QDQ format for 4 bits tensor requires onnxruntime >= 1.19.0 and the opset version of model > 20, "
"but get onnxruntime version is {}, opset version is {}. Use QOperator format instead.".format(
ort_version, model.opset_import[0].version
)
)
quant_config.quant_format = QuantFormat.QOperator


###################### RTN Algo Entry ##################################
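`_update_quant_format` above downgrades a QDQ request to QOperator when the weight bit width, the onnxruntime version, or the model opset cannot support block-wise `DequantizeLinear`. A minimal standalone sketch of the opset side of that check follows; the file path is illustrative, and filtering `opset_import` by domain is an assumption layered on top of the diff's `model.opset_import[0].version` shortcut.

```python
import onnx
import onnxruntime as ort
from packaging import version

# Load only the graph structure; external weight data is not needed for this check.
model = onnx.load("model.onnx", load_external_data=False)  # illustrative path

# The default-domain ("" / "ai.onnx") opset entry governs DequantizeLinear support.
default_opset = next(
    (op.version for op in model.opset_import if op.domain in ("", "ai.onnx")), None
)

ort_ok = version.parse(ort.__version__) >= version.parse("1.19.0")
opset_ok = default_opset is not None and default_opset > 20

# Mirrors the warning in the diff: 4-bit QDQ needs both conditions, else QOperator.
print("4-bit QDQ available:", ort_ok and opset_ok)
```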
@@ -32,13 +59,16 @@ def rtn_quantize_entry(
model: Union[pathlib.Path, str], quant_config: config.RTNConfig, *args, **kwargs
) -> onnx.ModelProto:
"""The main entry to apply rtn quantization."""
_update_quant_format(constants.RTN, model, quant_config)

if len(quant_config.config_mapping) == 0:
# map config to each op
model_info = config.RTNConfig.get_model_info(model=model)
config_mapping = quant_config.to_config_mapping(model_info=model_info)
logger.debug(config_mapping)
else:
config_mapping = quant_config.config_mapping

quant_kwargs = {}
for key in config.RTNConfig.model_params_list:
val = getattr(quant_config, key)

@@ -62,6 +92,8 @@ def gptq_quantize_entry(
calibration_data_reader, data_reader.CalibrationDataReader
), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader"

_update_quant_format(constants.GPTQ, model, quant_config)

if len(quant_config.config_mapping) == 0:
# map config to each op
model_info = config.GPTQConfig.get_model_info(model=model)

@@ -96,6 +128,8 @@ def awq_quantize_entry(
calibration_data_reader, data_reader.CalibrationDataReader
), "Please follow onnx_neural_compressor/data_reader.py to implement calibration_data_reader"

_update_quant_format(constants.AWQ, model, quant_config)

if len(quant_config.config_mapping) == 0:
# map config to each op
model_info = config.AWQConfig.get_model_info(model=model)
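Both the GPTQ and AWQ entries assert that `calibration_data_reader` follows `onnx_neural_compressor/data_reader.py`. A rough sketch of such a reader is below, assuming the interface is the usual `get_next()` / `rewind()` pair; the base-class import path matches the diff, but the sample shapes, the input name "input_ids", and the class name are made up for illustration.

```python
import numpy as np
from onnx_neural_compressor import data_reader

class DummyCalibrationDataReader(data_reader.CalibrationDataReader):
    """Feeds a handful of random samples for calibration (illustrative only)."""

    def __init__(self, num_samples: int = 8):
        self.samples = [
            {"input_ids": np.random.randint(0, 32000, size=(1, 128), dtype=np.int64)}
            for _ in range(num_samples)
        ]
        self.iter = iter(self.samples)

    def get_next(self):
        # Return one feed dict per call, or None when calibration data is exhausted.
        return next(self.iter, None)

    def rewind(self):
        # Restart iteration so the reader can be reused by another algorithm pass.
        self.iter = iter(self.samples)
```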