Commit 04625d0: simplify config and fix ut

Signed-off-by: Mengni Wang <[email protected]>
mengniwang95 committed Aug 16, 2024
1 parent 13b69e3 commit 04625d0
Showing 10 changed files with 137 additions and 294 deletions.
@@ -74,7 +74,8 @@
     parser.add_argument(
         "--tasks",
         nargs="+",
-        default=[
+        default=["lambada_openai"],
+        choices=[
             "winogrande",
             "copa",
             "piqa",
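The new default keeps `--tasks` a one-element list (`lambada_openai`) while the full task set moves into `choices`, so every user-supplied name is still validated. A minimal sketch of the argparse behavior; the truncated task list here is illustrative:

```python
import argparse

parser = argparse.ArgumentParser()
# nargs="+" collects one or more task names into a list; "choices"
# validates each element, and the default stays a plain list.
parser.add_argument(
    "--tasks",
    nargs="+",
    default=["lambada_openai"],
    choices=["winogrande", "copa", "piqa", "lambada_openai"],  # truncated for illustration
)

print(parser.parse_args([]).tasks)                           # ['lambada_openai']
print(parser.parse_args(["--tasks", "copa", "piqa"]).tasks)  # ['copa', 'piqa']
```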
@@ -14,19 +14,19 @@ function init_params {
   do
     case $var in
       --input_model=*)
-          input_model=$(echo $var |cut -f2 -d=)
+          input_model=$(echo "$var" |cut -f2 -d=)
       ;;
       --batch_size=*)
-          batch_size=$(echo $var |cut -f2 -d=)
+          batch_size=$(echo "$var" |cut -f2 -d=)
       ;;
       --tokenizer=*)
-          tokenizer=$(echo $var |cut -f2 -d=)
+          tokenizer=$(echo "$var" |cut -f2 -d=)
       ;;
       --mode=*)
-          mode=$(echo $var |cut -f2 -d=)
+          mode=$(echo "$var" |cut -f2 -d=)
       ;;
       --intra_op_num_threads=*)
-          intra_op_num_threads=$(echo $var |cut -f2 -d=)
+          intra_op_num_threads=$(echo "$var" |cut -f2 -d=)
       ;;
     esac
   done
@@ -42,19 +42,27 @@ function run_benchmark {
         input_model=$(dirname "$input_model")
     fi
 
+    extra_cmd=""
+
     if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then
-        extra_cmd="--trust_remote_code True"
+        extra_cmd=$extra_cmd"--trust_remote_code True "
     fi
 
+    if [ "${batch_size}" ]; then
+        extra_cmd=$extra_cmd"--batch_size ${batch_size} "
+    fi
+    if [ "${tokenizer}" ]; then
+        extra_cmd=$extra_cmd"--tokenizer ${tokenizer} "
+    fi
+    if [ "${tasks}" ]; then
+        extra_cmd=$extra_cmd"--tasks ${tasks} "
+    fi
+    if [ "${intra_op_num_threads}" ]; then
+        extra_cmd=$extra_cmd"--intra_op_num_threads ${intra_op_num_threads} "
+    fi
 
-    python main.py \
-               --model_path="${input_model}" \
-               --batch_size="${batch_size-1}" \
-               --tokenizer="${tokenizer-meta-llama/Llama-2-7b-hf}" \
-               --tasks="${tasks-lambada_openai}" \
-               --mode="${mode}" \
-               --intra_op_num_threads="${intra_op_num_threads-24}" \
-               --benchmark \
-               ${extra_cmd}
+    extra_cmd=$extra_cmd"--benchmark"
+    eval "python main.py --model_path ${input_model} --mode ${mode} ${extra_cmd}"
 
 }
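The rewritten run_benchmark no longer bakes defaults such as `${tokenizer-meta-llama/Llama-2-7b-hf}` into the call; it appends a flag to `extra_cmd` only when the variable is set, letting main.py's own argparse defaults cover the rest. The same conditional-argument pattern, sketched in Python with subprocess (an argument list also sidesteps the quoting hazards of `eval`); the wrapper function is illustrative, not part of the repo:

```python
import subprocess

def run_benchmark(input_model, mode, batch_size=None, tokenizer=None,
                  tasks=None, intra_op_num_threads=None):
    # Mandatory arguments first, then optional flags only when provided,
    # mirroring the shell script's extra_cmd accumulation.
    cmd = ["python", "main.py", "--model_path", input_model, "--mode", mode]
    if batch_size is not None:
        cmd += ["--batch_size", str(batch_size)]
    if tokenizer is not None:
        cmd += ["--tokenizer", tokenizer]
    if tasks is not None:
        cmd += ["--tasks", *tasks]
    if intra_op_num_threads is not None:
        cmd += ["--intra_op_num_threads", str(intra_op_num_threads)]
    cmd.append("--benchmark")
    subprocess.run(cmd, check=True)
```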
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,25 @@ function init_params {
do
case $var in
--input_model=*)
input_model=$(echo $var |cut -f2 -d=)
input_model=$(echo "$var" |cut -f2 -d=)
;;
--output_model=*)
output_model=$(echo $var |cut -f2 -d=)
output_model=$(echo "$var" |cut -f2 -d=)
;;
--batch_size=*)
batch_size=$(echo $var |cut -f2 -d=)
batch_size=$(echo "$var" |cut -f2 -d=)
;;
--dataset=*)
dataset=$(echo $var |cut -f2 -d=)
dataset=$(echo "$var" |cut -f2 -d=)
;;
--tokenizer=*)
tokenizer=$(echo $var |cut -f2 -d=)
tokenizer=$(echo "$var" |cut -f2 -d=)
;;
--algorithm=*)
algorithm=$(echo $var |cut -f2 -d=)
algorithm=$(echo "$var" |cut -f2 -d=)
;;
--quant_format=*)
quant_format=$(echo $var |cut -f2 -d=)
quant_format=$(echo "$var" |cut -f2 -d=)
;;
esac
done
@@ -59,31 +59,42 @@ function run_tuning {
         echo "Created directory $output_model"
     fi
 
+    extra_cmd=""
+
     if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then
         nodes_to_exclude="/model/layers.*/self_attn/qkv_proj/MatMul /model/layers.*/mlp/down_proj/MatMul"
-        extra_cmd="--nodes_to_exclude ${nodes_to_exclude} --trust_remote_code True"
+        extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} --trust_remote_code True "
     fi
     if [[ "${tokenizer}" =~ "Llama-3-8B" ]]; then
         nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul"
-        extra_cmd="--nodes_to_exclude ${nodes_to_exclude}"
+        extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} "
     fi
     if [[ "${tokenizer}" =~ "Qwen2-7B" ]]; then
         nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul /model/layers.*/mlp/up_proj/MatMul"
-        extra_cmd="--nodes_to_exclude ${nodes_to_exclude}"
+        extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} "
     fi
 
+    if [ "${tokenizer}" ]; then
+        extra_cmd=$extra_cmd"--tokenizer ${tokenizer} "
+    fi
+    if [ "${batch_size}" ]; then
+        extra_cmd=$extra_cmd"--batch_size ${batch_size} "
+    fi
+    if [ "${dataset}" ]; then
+        extra_cmd=$extra_cmd"--dataset ${dataset} "
+    fi
+    if [ "${algorithm}" ]; then
+        extra_cmd=$extra_cmd"--algorithm ${algorithm} "
+    fi
+    if [ "${tasks}" ]; then
+        extra_cmd=$extra_cmd"--tasks ${tasks} "
+    fi
+    if [ "${quant_format}" ]; then
+        extra_cmd=$extra_cmd"--quant_format ${quant_format} "
+    fi
 
-    python main.py \
-               --model_path "${input_model}" \
-               --tokenizer "${tokenizer-meta-llama/Llama-2-7b-hf}" \
-               --output_model "${output_model}" \
-               --batch_size "${batch_size-1}" \
-               --dataset "${dataset-NeelNanda/pile-10k}" \
-               --algorithm "${algorithm-WOQ_TUNE}" \
-               --tasks "${tasks-lambada_openai}" \
-               --quant_format "${quant_format-QOperator}" \
-               --layer_wise \
-               --tune \
-               ${extra_cmd}
+    extra_cmd=$extra_cmd"--layer_wise --tune"
+    eval "python main.py --model_path ${input_model} --output_model ${output_model} ${extra_cmd}"
 }
 
 main "$@"
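The `nodes_to_exclude` values are regex patterns over ONNX node names, keeping sensitive MatMuls (e.g. `down_proj`) at full precision for these models. A sketch of how such patterns select nodes; the sample node names and the use of `re.fullmatch` are assumptions for illustration, not the matching logic in main.py:

```python
import re

# Patterns passed for Qwen2-7B in the script above.
patterns = [
    "/model/layers.*/mlp/down_proj/MatMul",
    "/model/layers.*/mlp/up_proj/MatMul",
]

node_names = [
    "/model/layers.0/mlp/down_proj/MatMul",
    "/model/layers.0/self_attn/qkv_proj/MatMul",
]

# A node is excluded when any pattern matches its full name.
excluded = [n for n in node_names if any(re.fullmatch(p, n) for p in patterns)]
print(excluded)  # ['/model/layers.0/mlp/down_proj/MatMul']
```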
13 changes: 5 additions & 8 deletions onnx_neural_compressor/algorithms/utility.py
@@ -222,14 +222,10 @@ def calculate_scale_zp(rmin, rmax, qType, sym, reduce_range=False):
     dtype = _qType_to_np_type(qType)
     if isinstance(rmax, np.ndarray):
         if sym:
-            mask = abs(rmin) > abs(rmax)
-            scale = np.ones(rmin.shape).astype(rmin.dtype)
-            scale[mask] = rmin[mask]
-            scale[~mask] = rmax[~mask]
-            abs_max = round((qmax - qmin) / 2)
-            scale /= abs_max
-        else:
-            scale = (rmax - rmin) / (qmax - qmin)
+            max_range = np.maximum(abs(rmin), abs(rmax))
+            rmin = -max_range
+            rmax = max_range
+        scale = (rmax - rmin) / (qmax - qmin)
         scale[abs(scale) < np.finfo(rmax.dtype).tiny] = 1
         zero_point = (
             np.multiply(np.ones(rmax.shape), np.round((qmax + qmin) / 2.0)).astype(dtype)
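The symmetric branch now mirrors the range to ±max(|rmin|, |rmax|) and reuses the shared scale formula, which keeps the scale non-negative (the old masked version could copy a negative rmin into it) and drops the intermediate round(). A standalone numpy sketch of the new computation, with int8 qmin/qmax assumed for illustration (the real function derives them from qType):

```python
import numpy as np

def sym_scale_zp(rmin, rmax, qmin=-128, qmax=127):
    # Mirror the range around zero: both bounds become +/- max(|rmin|, |rmax|).
    max_range = np.maximum(np.abs(rmin), np.abs(rmax))
    rmin, rmax = -max_range, max_range
    scale = (rmax - rmin) / (qmax - qmin)                  # always >= 0
    scale[np.abs(scale) < np.finfo(rmax.dtype).tiny] = 1   # guard all-zero channels
    zero_point = np.full(rmax.shape, round((qmax + qmin) / 2.0), dtype=np.int8)
    return scale, zero_point

rmin = np.array([-0.5, -0.1, 0.0], dtype=np.float32)
rmax = np.array([0.3, 0.4, 0.0], dtype=np.float32)
scale, zp = sym_scale_zp(rmin, rmax)
print(scale)  # per channel 2*max_range/255; the all-zero channel falls back to 1
```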
@@ -612,6 +608,7 @@ def dump_woq_stats(model, quantize_config):
 
         if optype not in res:
             res[optype] = {}
+
         if re.match("^.*_Q\d*G\d*", node.input[1]):
             Q_position = re.search("_Q\d*", node.input[1])
             full_position = re.search("_Q\d*G\d*", node.input[1])
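`dump_woq_stats` recognizes weight-only-quantized initializers by a `_Q<bits>G<group_size>` suffix on the tensor name (the second input of the node). A sketch of the parsing; the sample name and the slicing after the two searches are assumptions, since the diff is truncated here:

```python
import re

name = "model.layers.0.mlp.down_proj.weight_Q4G32"  # hypothetical initializer name

if re.match(r"^.*_Q\d*G\d*", name):
    Q_position = re.search(r"_Q\d*", name)
    full_position = re.search(r"_Q\d*G\d*", name)
    bits = name[Q_position.start() + 2 : Q_position.end()]         # "4"
    group_size = name[Q_position.end() + 1 : full_position.end()]  # "32"
    print(bits, group_size)
```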
2 changes: 1 addition & 1 deletion onnx_neural_compressor/quantization/algorithm_entry.py
@@ -192,7 +192,7 @@ def smooth_quant_entry(
         calibration_data_reader,
         execution_provider=getattr(quant_config, "execution_provider", "CPUExecutionProvider"),
     )
-    smoothed_model = smoother.transform(**quant_config.to_dict())
+    smoothed_model = smoother.transform(**quant_config.get_model_params_dict())
     with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir:
         # ORT quant API requires str input
         onnx.save_model(
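`to_dict()` would forward every config field to `Smoother.transform`, including settings like `execution_provider` that are already consumed elsewhere; `get_model_params_dict()` narrows the kwargs to the transformation parameters. A minimal illustration of the distinction; `DemoConfig` and its fields are hypothetical, not the library's API:

```python
import dataclasses

@dataclasses.dataclass
class DemoConfig:
    alpha: float = 0.5                                  # a transform() parameter
    execution_provider: str = "CPUExecutionProvider"    # runtime-only setting

    def to_dict(self):
        return dataclasses.asdict(self)

    def get_model_params_dict(self):
        # Restrict kwargs to what transform() accepts.
        return {"alpha": self.alpha}

def transform(alpha):
    return f"smoothed with alpha={alpha}"

cfg = DemoConfig()
# transform(**cfg.to_dict())   # would raise: unexpected keyword 'execution_provider'
print(transform(**cfg.get_model_params_dict()))
```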