diff --git a/examples/.config/model_params_onnxrt.json b/examples/.config/model_params_onnxrt.json
index e7e0cbda6..4ade34f75 100644
--- a/examples/.config/model_params_onnxrt.json
+++ b/examples/.config/model_params_onnxrt.json
@@ -1,60 +1,103 @@
 {
   "onnxrt": {
     "llama-2-7b-rtn": {
-      "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
+      "model_name": "meta-llama/Llama-2-7b-hf",
+      "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
       "dataset_location": "",
       "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf",
       "main_script": "main.py",
-      "batch_size": 1
+      "batch_size": 1,
+      "algorithm": "RTN"
     },
     "llama-2-7b-rtn-with-past": {
-      "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
+      "model_name": "meta-llama/Llama-2-7b-hf",
+      "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
       "dataset_location": "",
       "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past",
       "main_script": "main.py",
-      "batch_size": 1
+      "batch_size": 1,
+      "algorithm": "RTN"
     },
     "llama-2-7b-awq": {
-      "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
+      "model_name": "meta-llama/Llama-2-7b-hf",
+      "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
       "dataset_location": "",
       "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf",
       "main_script": "main.py",
-      "batch_size": 1
+      "batch_size": 1,
+      "algorithm": "AWQ"
     },
     "llama-2-7b-awq-with-past": {
-      "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
+      "model_name": "meta-llama/Llama-2-7b-hf",
+      "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
       "dataset_location": "",
       "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past",
       "main_script": "main.py",
-      "batch_size": 1
+      "batch_size": 1,
+      "algorithm": "AWQ"
     },
     "llama-2-7b-gptq": {
-      "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
+      "model_name": "meta-llama/Llama-2-7b-hf",
+      "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
       "dataset_location": "",
       "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf",
       "main_script": "main.py",
-      "batch_size": 1
+      "batch_size": 1,
+      "algorithm": "GPTQ"
     },
     "llama-2-7b-gptq-with-past": {
-      "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
+      "model_name": "meta-llama/Llama-2-7b-hf",
+      "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
       "dataset_location": "",
       "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past",
       "main_script": "main.py",
-      "batch_size": 1
+      "batch_size": 1,
+      "algorithm": "GPTQ"
     },
     "llama-2-7b-woq_tune": {
-      "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
+      "model_name": "meta-llama/Llama-2-7b-hf",
+      "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
       "dataset_location": "",
       "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf",
       "main_script": "main.py",
-      "batch_size": 1
+      "batch_size": 1,
+      "algorithm": "WOQ_TUNE"
    },
     "llama-2-7b-woq_tune-with-past": {
-      "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
+      "model_name": "meta-llama/Llama-2-7b-hf",
+      "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
       "dataset_location": "",
       "input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past",
       "main_script": "main.py",
"main.py", - "batch_size": 1 + "batch_size": 1, + "algorithm": "WOQ_TUNE" + }, + "llama-3-8b-gptq-with-past": { + "model_name": "meta-llama/Meta-Llama-3-8B", + "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only", + "dataset_location": "", + "input_model": "/tf_dataset2/models/onnx/Meta-Llama-3-8B-with-past", + "main_script": "main.py", + "batch_size": 1, + "algorithm": "GPTQ" + }, + "phi-3-mini-128k-instruct-rtn-with-past": { + "model_name": "microsoft/Phi-3-mini-128k-instruct", + "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only", + "dataset_location": "", + "input_model": "/tf_dataset2/models/onnx/Phi-3-mini-128k-instruct-with-past", + "main_script": "main.py", + "batch_size": 1, + "algorithm": "RTN" + }, + "qwen2-7b-instruct-rtn-with-past": { + "model_name": "Qwen/Qwen2-7B-Instruct", + "model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only", + "dataset_location": "", + "input_model": "/tf_dataset2/models/onnx/Qwen2-7B-Instruct-with-past", + "main_script": "main.py", + "batch_size": 1, + "algorithm": "RTN" }, "bert_base_MRPC": { "model_src_dir": "nlp/bert/quantization/ptq_static", diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md similarity index 86% rename from examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md rename to examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md index 9ddbc7f2c..79b17c73f 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md +++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md @@ -14,7 +14,7 @@ pip install -r requirements.txt ## 2. Prepare Model -Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. There are other models available that can be used for weight-only quantization. The following table shows a few models' configurations: +Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. We verified weight-only quantization on other models as follows. | Model | Num Hidden Layers| Num Attention Heads | Hidden Size | | --- | --- | --- | --- | @@ -24,14 +24,17 @@ Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. There are | [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 40 | 40 | 5120 | | [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 80 | 64 | 8192 | | [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 80 | 64 | 8192 | +| [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 32 | 32 | 4096 | +| [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) | 32 | 32 | 3072 | +| [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | 28 | 28 | 3584 | Export to ONNX model: ```bash python prepare_model.py --input_model="meta-llama/Llama-2-7b-hf" \ - --output_model="./llama-2-7b-hf" \ --task=text-generation-with-past \ # or text-generation ``` + # Run ## 1. 
@@ -53,7 +56,7 @@ Accuracy:
 
 ```bash
 bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model
-                      --batch_size=batch_size \ # optional 
+                      --batch_size=batch_size \ # optional
                       --mode=accuracy \
                       --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
                       --tasks=lambada_openai
diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/__init__.py b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/__init__.py
similarity index 100%
rename from examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/__init__.py
rename to examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/__init__.py
diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/accuracy.py b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/accuracy.py
similarity index 100%
rename from examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/accuracy.py
rename to examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/accuracy.py
diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/evaluator.py b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/evaluator.py
similarity index 100%
rename from examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/evaluator.py
rename to examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/evaluator.py
diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/__init__.py b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/models/__init__.py
similarity index 100%
rename from examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/__init__.py
rename to examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/models/__init__.py
diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/huggingface.py b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/models/huggingface.py
similarity index 100%
rename from examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/huggingface.py
rename to examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/models/huggingface.py
diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/utils.py b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/utils.py
similarity index 100%
rename from examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/utils.py
rename to examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/utils.py
diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py
similarity index 92%
rename from examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
rename to examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py
index 37c971534..e327aa827 100644
--- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
+++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py
@@ -44,7 +44,7 @@
 parser.add_argument("--model_path", type=str, help="Folder path of pre-trained onnx model")
 parser.add_argument("--benchmark", action="store_true", default=False)
 parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model")
-parser.add_argument("--output_model", type=str, default=None, help="output model path")
+parser.add_argument("--output_model", type=str, default=None, help="path of output directory")
 parser.add_argument(
     "--batch_size",
     default=1,
@@ -92,11 +92,27 @@
 parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy")
 parser.add_argument("--intra_op_num_threads", type=int, default=24)
 parser.add_argument("--trust_remote_code", type=bool, default=False)
+parser.add_argument("--layer_wise", action="store_true", default=False)
+parser.add_argument(
+    "--quantize_lm_head",
+    action="store_true",
+    default=False,
+    help="language modeling head will not be quantized by default. Doesn't take effect when 'algorithm' is 'WOQ_TUNE'",
+)
+parser.add_argument(
+    "--nodes_to_exclude",
+    nargs="+",
+    default=[],
+    help="nodes that will not be quantized. Doesn't take effect when 'algorithm' is 'WOQ_TUNE'",
+)
 args = parser.parse_args()
 
+if args.tune and not os.path.exists(args.output_model):
+    os.makedirs(args.output_model)
+
 # load model
-tokenizer = transformers.LlamaTokenizer.from_pretrained(args.tokenizer)
-model_config = transformers.LlamaConfig.from_pretrained(args.model_path)
+tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer)
+model_config = transformers.AutoConfig.from_pretrained(args.model_path, trust_remote_code=args.trust_remote_code)
 
 
 def tokenize_function(examples):
@@ -110,7 +126,8 @@ def replace_architectures(json_path):
     # refer to https://github.com/huggingface/transformers/issues/22222#issuecomment-1477171703
     with open(json_path, "r") as file:
         data = json.load(file)
-    data["architectures"] = ["LlamaForCausalLM"]
+    if data["architectures"] == ["LLaMATokenizer"]:
+        data["architectures"] = ["LlamaForCausalLM"]
 
     with open(json_path, "w") as file:
         json.dump(data, file, indent=4)
@@ -327,14 +344,18 @@ def rewind(self):
 model_name = "model.onnx"  # require optimum >= 1.14.0
 model_path = os.path.join(args.model_path, model_name)
 best_model = None
+
+nodes_to_exclude = ["/lm_head/MatMul"] if not args.quantize_lm_head else []
+nodes_to_exclude = list(set(args.nodes_to_exclude + nodes_to_exclude))
 if args.algorithm.upper() == "RTN":
-    algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=True)
+    algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=args.layer_wise)
     quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
         model_path,
         n_bits=4,
         block_size=32,
         is_symmetric=True,
         algo_config=algo_config,
+        nodes_to_exclude=nodes_to_exclude,
     )
     quant.process()
     best_model = quant.model
@@ -350,6 +371,7 @@
         block_size=32,
         is_symmetric=True,
         algo_config=algo_config,
+        nodes_to_exclude=nodes_to_exclude,
     )
     quant.process()
     best_model = quant.model
@@ -357,7 +379,7 @@
 elif args.algorithm.upper() == "GPTQ":
     calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1)
     algo_config = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig(
-        calibration_data_reader=calibration_data_reader, layer_wise_quant=True
+        calibration_data_reader=calibration_data_reader, layer_wise_quant=args.layer_wise
     )
     quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
         model_path,
@@ -365,6 +387,7 @@
         block_size=32,
         is_symmetric=False,
         algo_config=algo_config,
+        nodes_to_exclude=nodes_to_exclude,
     )
     quant.process()
     best_model = quant.model
diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/prepare_model.py
similarity index 76%
rename from examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py
rename to examples/nlp/huggingface_model/text_generation/quantization/weight_only/prepare_model.py
index 3af820943..4d5d357da 100644
--- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py
+++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/prepare_model.py
@@ -10,8 +10,8 @@
 
 def parse_arguments():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--input_model", type=str, required=False, default="")
-    parser.add_argument("--output_model", type=str, required=True)
+    parser.add_argument("--input_model", type=str, required=True, default="")
+    parser.add_argument("--output_model", type=str, required=False, default=None)
     parser.add_argument(
         "--task",
         type=str,
@@ -19,7 +19,10 @@ def parse_arguments():
         default="text-generation-with-past",
         choices=["text-generation-with-past", "text-generation"],
     )
-    return parser.parse_args()
+    args = parser.parse_args()
+    if args.output_model is None:
+        args.output_model = os.path.basename(args.input_model) + "-onnx"
+    return args
 
 
 def prepare_model(input_model, output_model, task):
@@ -37,6 +40,7 @@ def prepare_model(input_model, output_model, task):
             "--task",
             task,
             f"{output_model}",
+            "--trust-remote-code",
         ],
         stdout=subprocess.PIPE,
         text=True,
diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/requirements.txt b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/requirements.txt
similarity index 100%
rename from examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/requirements.txt
rename to examples/nlp/huggingface_model/text_generation/quantization/weight_only/requirements.txt
diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_benchmark.sh b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh
similarity index 86%
rename from examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_benchmark.sh
rename to examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh
index 1f728c0f1..72348427c 100644
--- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_benchmark.sh
+++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh
@@ -35,23 +35,27 @@ function init_params {
 
 # run_benchmark
 function run_benchmark {
-
+    
     # Check if the input_model ends with the filename extension ".onnx"
     if [[ $input_model =~ \.onnx$ ]]; then
         # If the string ends with the filename extension, get the path of the file
         input_model=$(dirname "$input_model")
     fi
 
-    python main.py \
+    if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then
+        extra_cmd="--trust_remote_code True"
+    fi
+
+    eval "python main.py \
                --model_path ${input_model} \
                --batch_size=${batch_size-1} \
                --tokenizer=${tokenizer-meta-llama/Llama-2-7b-hf} \
                --tasks=${tasks-lambada_openai} \
                --mode=${mode} \
                --intra_op_num_threads=${intra_op_num_threads-24} \
-               --benchmark
-
+               --benchmark \
+               ${extra_cmd}"
+
 }
 
 main "$@"
-
diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_quant.sh b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh
similarity index 69%
rename from examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_quant.sh
rename to examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh
index 295b47249..4198da9a8 100644
--- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/run_quant.sh
+++ b/examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh
@@ -56,7 +56,20 @@ function run_tuning {
         echo "Created directory $output_model"
     fi
 
-    python main.py \
+    if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then
+        nodes_to_exclude="/model/layers.*/self_attn/qkv_proj/MatMul /model/layers.*/mlp/down_proj/MatMul"
+        extra_cmd="--nodes_to_exclude ${nodes_to_exclude} --trust_remote_code True"
+    fi
+    if [[ "${tokenizer}" =~ "Llama-3-8B" ]]; then
+        nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul"
+        extra_cmd="--nodes_to_exclude ${nodes_to_exclude}"
+    fi
+    if [[ "${tokenizer}" =~ "Qwen2-7B" ]]; then
+        nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul /model/layers.*/mlp/up_proj/MatMul"
+        extra_cmd="--nodes_to_exclude ${nodes_to_exclude}"
+    fi
+
+    eval "python main.py \
                --model_path ${input_model} \
                --tokenizer ${tokenizer-meta-llama/Llama-2-7b-hf} \
                --output_model ${output_model} \
@@ -64,8 +77,9 @@ function run_tuning {
                --dataset ${dataset-NeelNanda/pile-10k} \
                --algorithm ${algorithm-WOQ_TUNE} \
                --tasks ${tasks-lambada_openai} \
-               --tune
+               --layer_wise \
+               --tune \
+               ${extra_cmd}"
 }
 
 main "$@"
-
diff --git a/onnx_neural_compressor/algorithms/layer_wise/core.py b/onnx_neural_compressor/algorithms/layer_wise/core.py
index fe07a3a77..80077a9be 100644
--- a/onnx_neural_compressor/algorithms/layer_wise/core.py
+++ b/onnx_neural_compressor/algorithms/layer_wise/core.py
@@ -66,16 +66,7 @@ def layer_wise_quant(
     # get and check split nodes
     split_nodes = origin_model.find_split_nodes()
     if len(split_nodes) == 0:
-        logger.error(
-            "Can't find split nodes for layer-wise quantization. "
" - "We recommend applying graph optimization for your model like follows: \n" - "import onnxruntime as ort \n" - "sess_options = ort.SessionOptions() \n" - "sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED " - "# or ORT_ENABLE_BASIC \n" - "sess_options.optimized_model_filepath = 'optimized_model_path' \n" - "ort.InferenceSession(infer_shape_model_path, sess_options)" - ) + logger.error("Can't find split nodes for layer-wise quantization.") raise ValueError("Fail to run layer-wise quantization.") logger.info( "Will split model into {} parts to do layer-wise quantization".format( diff --git a/onnx_neural_compressor/onnx_model.py b/onnx_neural_compressor/onnx_model.py index fe51eac92..5488615e5 100644 --- a/onnx_neural_compressor/onnx_model.py +++ b/onnx_neural_compressor/onnx_model.py @@ -241,7 +241,7 @@ def save(self, root): root, save_as_external_data=True, all_tensors_to_one_file=True, - location=root.split("/")[-1] + "_data", + location=os.path.basename(root) + "_data", size_threshold=1024, convert_attribute=False, ) @@ -1001,7 +1001,7 @@ def _save_split_model(self, save_path): save_path, save_as_external_data=True, all_tensors_to_one_file=True, - location=save_path.split("/")[-1] + "_data", + location=os.path.basename(save_path) + "_data", size_threshold=1024, convert_attribute=False, )