Enable more LLM examples (#28)
Signed-off-by: Mengni Wang <[email protected]>
Signed-off-by: yuwenzho <[email protected]>
yuwenzho authored Jul 26, 2024
1 parent d326f90 commit 5460cf1
Showing 15 changed files with 130 additions and 48 deletions.
75 changes: 59 additions & 16 deletions examples/.config/model_params_onnxrt.json
@@ -1,60 +1,103 @@
{
"onnxrt": {
"llama-2-7b-rtn": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "RTN"
},
"llama-2-7b-rtn-with-past": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "RTN"
},
"llama-2-7b-awq": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "AWQ"
},
"llama-2-7b-awq-with-past": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "AWQ"
},
"llama-2-7b-gptq": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "GPTQ"
},
"llama-2-7b-gptq-with-past": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "GPTQ"
},
"llama-2-7b-woq_tune": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "WOQ_TUNE"
},
"llama-2-7b-woq_tune-with-past": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"model_name": "meta-llama/Llama-2-7b-hf",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past",
"main_script": "main.py",
"batch_size": 1
"batch_size": 1,
"algorithm": "WOQ_TUNE"
},
"llama-3-8b-gptq-with-past": {
"model_name": "meta-llama/Meta-Llama-3-8B",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Meta-Llama-3-8B-with-past",
"main_script": "main.py",
"batch_size": 1,
"algorithm": "GPTQ"
},
"phi-3-mini-128k-instruct-rtn-with-past": {
"model_name": "microsoft/Phi-3-mini-128k-instruct",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Phi-3-mini-128k-instruct-with-past",
"main_script": "main.py",
"batch_size": 1,
"algorithm": "RTN"
},
"qwen2-7b-instruct-rtn-with-past": {
"model_name": "Qwen/Qwen2-7B-Instruct",
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/Qwen2-7B-Instruct-with-past",
"main_script": "main.py",
"batch_size": 1,
"algorithm": "RTN"
},
"bert_base_MRPC": {
"model_src_dir": "nlp/bert/quantization/ptq_static",
@@ -14,7 +14,7 @@ pip install -r requirements.txt
## 2. Prepare Model

Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. There are other models available that can be used for weight-only quantization. The following table shows a few models' configurations:
Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. Weight-only quantization has also been verified on the following models.

| Model | Num Hidden Layers| Num Attention Heads | Hidden Size |
| --- | --- | --- | --- |
@@ -24,14 +24,17 @@ Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. There are
| [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 40 | 40 | 5120 |
| [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 80 | 64 | 8192 |
| [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 80 | 64 | 8192 |
| [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 32 | 32 | 4096 |
| [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) | 32 | 32 | 3072 |
| [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | 28 | 28 | 3584 |

Export to ONNX model:
```bash
python prepare_model.py --input_model="meta-llama/Llama-2-7b-hf" \
--output_model="./llama-2-7b-hf" \
                        --task=text-generation-with-past  # or text-generation
```
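The same export command works for the other verified models; for example (the model choice and output path below are illustrative, not prescribed by this example):

```bash
python prepare_model.py --input_model="microsoft/Phi-3-mini-128k-instruct" \
                        --output_model="./phi-3-mini-128k-instruct" \
                        --task=text-generation-with-past
```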


# Run

## 1. Quantization
@@ -53,7 +56,7 @@ Accuracy:

```bash
bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model
--batch_size=batch_size \ # optional
--batch_size=batch_size \ # optional
--mode=accuracy \
--tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
--tasks=lambada_openai
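# The same interface applies to the other verified models; illustrative example (paths are placeholders):
#   bash run_benchmark.sh --input_model=path/to/model \
#                         --mode=accuracy \
#                         --tokenizer=Qwen/Qwen2-7B-Instruct \
#                         --tasks=lambada_openai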
@@ -44,7 +44,7 @@
parser.add_argument("--model_path", type=str, help="Folder path of pre-trained onnx model")
parser.add_argument("--benchmark", action="store_true", default=False)
parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model")
parser.add_argument("--output_model", type=str, default=None, help="output model path")
parser.add_argument("--output_model", type=str, default=None, help="path of output dircectory")
parser.add_argument(
"--batch_size",
default=1,
@@ -92,11 +92,27 @@
parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy")
parser.add_argument("--intra_op_num_threads", type=int, default=24)
parser.add_argument("--trust_remote_code", type=bool, default=False)
parser.add_argument("--layer_wise", action="store_true", default=False)
parser.add_argument(
"--quantize_lm_head",
action="store_true",
default=False,
help="language modeling head will not be quantized by default. Doesn't take effect when 'algorithm' is 'WOQ_TUNE'",
)
parser.add_argument(
"--nodes_to_exclude",
nargs="+",
default=[],
help="nodes that will not be quantized. Doesn't take effect when 'algorithm' is 'WOQ_TUNE'",
)
args = parser.parse_args()
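# Illustrative invocation of the options above (paths and node patterns are placeholders):
#   python main.py --model_path ./Llama-2-7b-hf-with-past --output_model ./llama-2-7b-rtn \
#       --tokenizer meta-llama/Llama-2-7b-hf --algorithm RTN --tune --layer_wise \
#       --nodes_to_exclude "/model/layers.*/mlp/down_proj/MatMul"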

if args.tune and not os.path.exists(args.output_model):
os.makedirs(args.output_model)

# load model
tokenizer = transformers.LlamaTokenizer.from_pretrained(args.tokenizer)
model_config = transformers.LlamaConfig.from_pretrained(args.model_path)
tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer)
model_config = transformers.AutoConfig.from_pretrained(args.model_path, trust_remote_code=args.trust_remote_code)
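# Auto* loaders (instead of the Llama-specific classes) let the same script handle
# non-Llama checkpoints such as Phi-3 and Qwen2.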


def tokenize_function(examples):
@@ -110,7 +126,8 @@ def replace_architectures(json_path):
# refer to https://github.com/huggingface/transformers/issues/22222#issuecomment-1477171703
with open(json_path, "r") as file:
data = json.load(file)
data["architectures"] = ["LlamaForCausalLM"]
if data["architectures"] == ["LLaMATokenizer"]:
data["architectures"] = ["LlamaForCausalLM"]

with open(json_path, "w") as file:
json.dump(data, file, indent=4)
@@ -327,14 +344,18 @@ def rewind(self):
model_name = "model.onnx" # require optimum >= 1.14.0
model_path = os.path.join(args.model_path, model_name)
best_model = None

nodes_to_exclude = ["/lm_head/MatMul"] if not args.quantize_lm_head else []
nodes_to_exclude = list(set(args.nodes_to_exclude + nodes_to_exclude))
if args.algorithm.upper() == "RTN":
algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=True)
algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=args.layer_wise)
quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
model_path,
n_bits=4,
block_size=32,
is_symmetric=True,
algo_config=algo_config,
nodes_to_exclude=nodes_to_exclude,
)
quant.process()
best_model = quant.model
@@ -350,21 +371,23 @@ def rewind(self):
block_size=32,
is_symmetric=True,
algo_config=algo_config,
nodes_to_exclude=nodes_to_exclude,
)
quant.process()
best_model = quant.model

elif args.algorithm.upper() == "GPTQ":
calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1)
algo_config = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig(
calibration_data_reader=calibration_data_reader, layer_wise_quant=True
calibration_data_reader=calibration_data_reader, layer_wise_quant=args.layer_wise
)
quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
model_path,
n_bits=4,
block_size=32,
is_symmetric=False,
algo_config=algo_config,
nodes_to_exclude=nodes_to_exclude,
)
quant.process()
best_model = quant.model
@@ -10,16 +10,19 @@

def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--input_model", type=str, required=False, default="")
parser.add_argument("--output_model", type=str, required=True)
parser.add_argument("--input_model", type=str, required=True, default="")
parser.add_argument("--output_model", type=str, required=False, default=None)
parser.add_argument(
"--task",
type=str,
required=False,
default="text-generation-with-past",
choices=["text-generation-with-past", "text-generation"],
)
return parser.parse_args()
args = parser.parse_args()
if args.output_model is None:
args.output_model = os.path.basename(args.input_model) + "-onnx"
return args
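# Illustrative: with the default above, omitting --output_model writes the export to
# "<basename(input_model)>-onnx", e.g. "Phi-3-mini-128k-instruct-onnx":
#   python prepare_model.py --input_model microsoft/Phi-3-mini-128k-instruct --task text-generation-with-past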


def prepare_model(input_model, output_model, task):
@@ -37,6 +40,7 @@ def prepare_model(input_model, output_model, task):
"--task",
task,
f"{output_model}",
"--trust-remote-code",
],
stdout=subprocess.PIPE,
text=True,
@@ -35,23 +35,27 @@ function init_params {

# run_benchmark
function run_benchmark {

# Check if the input_model ends with the filename extension ".onnx"
if [[ $input_model =~ \.onnx$ ]]; then
# If the string ends with the filename extension, get the path of the file
input_model=$(dirname "$input_model")
fi

python main.py \
if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then
extra_cmd="--trust_remote_code True"
fi

eval "python main.py \
--model_path ${input_model} \
--batch_size=${batch_size-1} \
--tokenizer=${tokenizer-meta-llama/Llama-2-7b-hf} \
--tasks=${tasks-lambada_openai} \
--mode=${mode} \
--intra_op_num_threads=${intra_op_num_threads-24} \
--benchmark

--benchmark \
${extra_cmd}"

}

main "$@"

@@ -56,16 +56,30 @@ function run_tuning {
echo "Created directory $output_model"
fi

python main.py \
if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then
nodes_to_exclude="/model/layers.*/self_attn/qkv_proj/MatMul /model/layers.*/mlp/down_proj/MatMul"
extra_cmd="--nodes_to_exclude ${nodes_to_exclude} --trust_remote_code True"
fi
if [[ "${tokenizer}" =~ "Llama-3-8B" ]]; then
nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul"
extra_cmd="--nodes_to_exclude ${nodes_to_exclude}"
fi
if [[ "${tokenizer}" =~ "Qwen2-7B" ]]; then
nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul /model/layers.*/mlp/up_proj/MatMul"
extra_cmd="--nodes_to_exclude ${nodes_to_exclude}"
fi

eval "python main.py \
--model_path ${input_model} \
--tokenizer ${tokenizer-meta-llama/Llama-2-7b-hf} \
--output_model ${output_model} \
--batch_size ${batch_size-1} \
--dataset ${dataset-NeelNanda/pile-10k} \
--algorithm ${algorithm-WOQ_TUNE} \
--tasks ${tasks-lambada_openai} \
--tune
--layer_wise \
--tune \
${extra_cmd}"
}

main "$@"

11 changes: 1 addition & 10 deletions onnx_neural_compressor/algorithms/layer_wise/core.py
@@ -66,16 +66,7 @@ def layer_wise_quant(
# get and check split nodes
split_nodes = origin_model.find_split_nodes()
if len(split_nodes) == 0:
logger.error(
"Can't find split nodes for layer-wise quantization. "
"We recommend applying graph optimization for your model like follows: \n"
"import onnxruntime as ort \n"
"sess_options = ort.SessionOptions() \n"
"sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED "
"# or ORT_ENABLE_BASIC \n"
"sess_options.optimized_model_filepath = 'optimized_model_path' \n"
"ort.InferenceSession(infer_shape_model_path, sess_options)"
)
logger.error("Can't find split nodes for layer-wise quantization.")
raise ValueError("Fail to run layer-wise quantization.")
logger.info(
"Will split model into {} parts to do layer-wise quantization".format(
4 changes: 2 additions & 2 deletions onnx_neural_compressor/onnx_model.py
@@ -241,7 +241,7 @@ def save(self, root):
root,
save_as_external_data=True,
all_tensors_to_one_file=True,
location=root.split("/")[-1] + "_data",
location=os.path.basename(root) + "_data",
size_threshold=1024,
convert_attribute=False,
)
@@ -1001,7 +1001,7 @@ def _save_split_model(self, save_path):
save_path,
save_as_external_data=True,
all_tensors_to_one_file=True,
location=save_path.split("/")[-1] + "_data",
location=os.path.basename(save_path) + "_data",
size_threshold=1024,
convert_attribute=False,
)
