diff --git a/src/brevitas_examples/llm/llm_quant/bias_corr.py b/src/brevitas_examples/llm/llm_quant/bias_corr.py
index dc603f8a3..049ae0baa 100644
--- a/src/brevitas_examples/llm/llm_quant/bias_corr.py
+++ b/src/brevitas_examples/llm/llm_quant/bias_corr.py
@@ -4,6 +4,7 @@
 """
 
 import torch
+from tqdm import tqdm
 
 from brevitas.graph.calibrate import bias_correction_mode
 
@@ -11,5 +12,5 @@
 @torch.no_grad()
 def apply_bias_correction(model, dataloader):
     with bias_correction_mode(model):
-        for inps in dataloader:
+        for inps in tqdm(dataloader):
             model(**inps)
diff --git a/src/brevitas_examples/llm/llm_quant/data_utils.py b/src/brevitas_examples/llm/llm_quant/data_utils.py
new file mode 100644
index 000000000..5375fcddf
--- /dev/null
+++ b/src/brevitas_examples/llm/llm_quant/data_utils.py
@@ -0,0 +1,108 @@
+"""
+Adapted from https://github.com/huggingface/optimum-amd, released under the following LICENSE:
+
+MIT License
+
+Copyright (c) 2023 Hugging Face
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+import random
+from typing import Any, Optional, Union
+
+import numpy as np
+from optimum.amd.brevitas.data_utils import DatasetToDevice
+from optimum.amd.brevitas.data_utils import get_c4
+from optimum.amd.brevitas.data_utils import get_wikitext2
+from optimum.utils.normalized_config import NormalizedConfigManager
+import torch
+from transformers import AutoConfig
+
+
+def get_dataset_for_model(
+        model_name_or_path: str,
+        dataset_name: str,
+        tokenizer: Any,
+        nsamples: int = 128,
+        seqlen: int = 2048,
+        seed: int = 0,
+        split: str = "train",
+        fuse_sequences: bool = True,
+        require_fx: bool = False,
+        device: Optional[Union[str, torch.device]] = None,
+):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.random.manual_seed(seed)
+    get_dataset_map = {
+        "wikitext2": get_wikitext2,
+        "c4": get_c4,}
+    if split not in ["train", "validation"]:
+        raise ValueError(f"The split need to be 'train' or 'validation' but found {split}")
+    if dataset_name not in get_dataset_map:
+        raise ValueError(
+            f"Expected a value in {list(get_dataset_map.keys())} but found {dataset_name}")
+    get_dataset_fn = get_dataset_map[dataset_name]
+
+    data = get_dataset_fn(
+        tokenizer=tokenizer,
+        nsamples=nsamples,
+        seqlen=seqlen,
+        split=split,
+        fuse_sequences=fuse_sequences,
+        seed=seed)
+
+    # In case the dataset is loaded to be used with an fx.GraphModule, we need to add empty past_key_values inputs in the dataset.
+    if require_fx:
+        config = AutoConfig.from_pretrained(model_name_or_path)
+
+        normalized_config_class = NormalizedConfigManager.get_normalized_config_class(
+            config.model_type)
+        normalized_config = normalized_config_class(config)
+
+        num_heads = normalized_config.num_attention_heads
+        if hasattr(normalized_config, "num_key_value_heads"):
+            num_kv_heads = normalized_config.num_key_value_heads
+        else:
+            num_kv_heads = num_heads
+        head_dim = normalized_config.hidden_size // num_heads
+        num_layers = normalized_config.num_layers
+
+        for sample in data:
+            sample["past_key_values"] = tuple((
+                torch.zeros(
+                    1,
+                    num_kv_heads,
+                    0,
+                    head_dim,
+                    device=sample["input_ids"].device,
+                    dtype=sample["input_ids"].dtype),
+                torch.zeros(
+                    1,
+                    num_kv_heads,
+                    0,
+                    head_dim,
+                    device=sample["input_ids"].device,
+                    dtype=sample["input_ids"].dtype),
+            ) for _ in range(num_layers))
+
+    data = DatasetToDevice(data, device=device)
+
+    return data
diff --git a/src/brevitas_examples/llm/llm_quant/eval.py b/src/brevitas_examples/llm/llm_quant/eval.py
index 271a5b36e..0691e5cfa 100644
--- a/src/brevitas_examples/llm/llm_quant/eval.py
+++ b/src/brevitas_examples/llm/llm_quant/eval.py
@@ -34,12 +34,12 @@ def create_validation_dataloader(data, seqlen, device):
 @torch.no_grad()
 def model_eval(model, valenc, seqlen):
     nsamples = len(valenc)
-    dev = next(iter(model.parameters())).device
     with torch.no_grad():
         nlls = []
         for inps in valenc:
             lm_logits = model(**inps)['logits']
             shift_logits = lm_logits[:, :-1, :].contiguous()
+            dev = shift_logits.device
             shift_labels = inps['input_ids'][:, 1:].to(dev)
             loss_fct = nn.CrossEntropyLoss()
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
diff --git a/src/brevitas_examples/llm/llm_quant/ln_affine_merge.py b/src/brevitas_examples/llm/llm_quant/ln_affine_merge.py
index 37aa8d5d3..7ac39347f 100644
--- a/src/brevitas_examples/llm/llm_quant/ln_affine_merge.py
+++ b/src/brevitas_examples/llm/llm_quant/ln_affine_merge.py
@@ -6,7 +6,6 @@
 import torch
 from torch import nn
 
-from brevitas.fx import value_trace
 from brevitas.graph.equalize import _is_reshaping_op
 from brevitas.graph.equalize import _is_scale_invariant_module
 from brevitas.graph.utils import get_module
@@ -84,9 +83,8 @@ def merge_layernorm_affine_params(graph_model):
 
 
 @torch.no_grad()
-def apply_layernorm_affine_merge(model, dtype, ref_kwargs):
+def apply_layernorm_affine_merge(graph_model, dtype):
     # We can't do fp16 tracing on CPU as many kernels are not implemented
     # So we have to cast to fp32 first, trace, apply merging, and then cast back
-    with cast_to_float32(model, dtype):
-        graph_model = value_trace(model, value_args=ref_kwargs)
+    with cast_to_float32(graph_model, dtype):
         merge_layernorm_affine_params(graph_model)
diff --git a/src/brevitas_examples/llm/llm_quant/prepare_for_quantize.py b/src/brevitas_examples/llm/llm_quant/prepare_for_quantize.py
index 2a9505227..d22b2eff1 100644
--- a/src/brevitas_examples/llm/llm_quant/prepare_for_quantize.py
+++ b/src/brevitas_examples/llm/llm_quant/prepare_for_quantize.py
@@ -1,5 +1,6 @@
 import warnings
 
+import torch
 from transformers.models.opt.modeling_opt import OPTAttention
 
 from brevitas.graph import ModuleToModuleByClass
@@ -21,3 +22,18 @@ def replace_mha_with_quantizable_layers(model, dtype):
     for rewriter in rewriters:
         model = rewriter.apply(model)
     return model
+
+
+@torch.no_grad()
+def add_zero_bias_to_linear(model: torch.nn.Module) -> torch.nn.Module:
+    for name, module in model.named_modules():
+        if type(module) == torch.nn.Linear:
+            if module.bias is None:
+                module.register_parameter(
+                    "bias",
+                    torch.nn.Parameter(
+                        torch.zeros((module.weight.shape[0],),
+                                    device=module.weight.device,
+                                    dtype=module.weight.dtype)),
+                )
+    return model
diff --git a/src/brevitas_examples/llm/main.py b/src/brevitas_examples/llm/main.py
index 5237c31c7..b8c19a9d7 100644
--- a/src/brevitas_examples/llm/main.py
+++ b/src/brevitas_examples/llm/main.py
@@ -9,6 +9,7 @@
 import numpy as np
 from optimum.amd.brevitas.accelerate_utils import offload_model
 from optimum.amd.brevitas.accelerate_utils import remove_hooks
+from optimum.amd.brevitas.data_utils import compute_perplexity
 from optimum.exporters.onnx import onnx_export_from_model
 import torch
 from transformers import AutoModelForCausalLM
@@ -16,12 +17,14 @@
 
 from brevitas.export import export_torch_qcdq
 from brevitas.export.onnx.standard.qcdq.manager import StdQCDQONNXManager
-from brevitas_examples.common.generative.quantize import quantize_model
+from brevitas.graph.quantize import layerwise_quantize
+from brevitas_examples.common.generative.quantize import generate_quant_maps
+from brevitas_examples.common.generative.quantize import generate_quantizers
+from brevitas_examples.common.parse_utils import add_bool_arg
 from brevitas_examples.common.parse_utils import quant_format_validator
 from brevitas_examples.llm.llm_quant.bias_corr import apply_bias_correction
 from brevitas_examples.llm.llm_quant.calibrate import apply_calibration
-from brevitas_examples.llm.llm_quant.data import get_c4
-from brevitas_examples.llm.llm_quant.data import get_wikitext2
+from brevitas_examples.llm.llm_quant.data_utils import get_dataset_for_model
 from brevitas_examples.llm.llm_quant.equalize import apply_act_equalization
 from brevitas_examples.llm.llm_quant.equalize import apply_weight_equalization
 from brevitas_examples.llm.llm_quant.eval import create_validation_dataloader
@@ -30,6 +33,7 @@
 from brevitas_examples.llm.llm_quant.export import brevitas_proxy_export_mode
 from brevitas_examples.llm.llm_quant.gptq import apply_gptq
 from brevitas_examples.llm.llm_quant.ln_affine_merge import apply_layernorm_affine_merge
+from brevitas_examples.llm.llm_quant.prepare_for_quantize import add_zero_bias_to_linear
 from brevitas_examples.llm.llm_quant.prepare_for_quantize import replace_mha_with_quantizable_layers
 from brevitas_examples.llm.llm_quant.run_utils import cast_to_float32
 from brevitas_examples.llm.llm_quant.run_utils import CastFloat16ToFloat32
@@ -47,7 +51,13 @@
 parser.add_argument(
     '--nsamples', type=int, default=128, help='Number of calibration data samples. Default: 128.')
 parser.add_argument('--seqlen', type=int, default=2048, help='Sequence length. Default: 2048.')
-parser.add_argument('--eval', action='store_true', help='Eval model PPL on C4.')
+parser.add_argument('--eval', action='store_true', help='Eval model PPL on the chosen Dataset.')
+parser.add_argument(
+    '--dataset',
+    type=str,
+    choices=['wikitext2', 'c4'],
+    default='wikitext2',
+    help='Dataset to use for quantization (default: %(default)s)')
 parser.add_argument('--weight-bit-width', type=int, default=8, help='Weight bit width. Default: 8.')
 parser.add_argument(
     '--weight-param-method',
@@ -135,8 +145,6 @@
     help='Group size for per_group input quantization. Default: 64.')
 parser.add_argument(
     '--quantize-input-zero-point', action='store_true', help='Quantize input zero-point.')
-parser.add_argument(
-    '--quantize-embedding', action='store_true', help='Quantize first nn.Embedding layer.')
 parser.add_argument(
     '--quantize-last-layer', action='store_true', help='Quantize last nn.Linear layer.')
 parser.add_argument('--gptq', action='store_true', help='Apply GPTQ.')
@@ -174,6 +182,15 @@
         'sharded_torchmlir_group_weight',
         'sharded_packed_torchmlir_group_weight'],
     help='Model export.')
+parser.add_argument(
+    '--checkpoint-name',
+    type=str,
+    default=None,
+    help="Filename to save checkpoint. If `None`, no checkpoint is saved (default: %(default)s)")
+add_bool_arg(
+    parser, 'use-ocp', default=False, help='Use OCP format for float quantization. Default: False')
+add_bool_arg(
+    parser, 'use-fnuz', default=True, help='Use FNUZ format for float quantization. Default: True')
 
 
 def set_seed(seed):
@@ -274,19 +291,56 @@ def main():
         with CastFloat16ToFloat32():
             apply_awq(model, awq_results)
 
-    calibration_loader = get_wikitext2(
-        nsamples=args.nsamples, tokenizer=tokenizer, seqlen=args.seqlen, seed=0)
-    val_data = get_wikitext2(
-        nsamples=args.nsamples, tokenizer=tokenizer, seqlen=args.seqlen, split='validation', seed=0)
+    require_fx = True if args.weight_equalization or args.act_equalization == 'fx' or args.ln_affine_merge else False
+    fuse_sequences = False
+
+    # Load the data for calibration and evaluation.
+    calibration_loader = get_dataset_for_model(
+        args.model,
+        dataset_name=args.dataset,
+        tokenizer=tokenizer,
+        nsamples=args.nsamples,
+        seqlen=args.seqlen,
+        split="train",
+        seed=args.seed,
+        require_fx=require_fx,
+        device=None,
+        fuse_sequences=fuse_sequences,
+    )
+
+    validation_loader = get_dataset_for_model(
+        args.model,
+        dataset_name=args.dataset,
+        tokenizer=tokenizer,
+        nsamples=args.nsamples,
+        seqlen=args.seqlen,
+        split="validation",
+        seed=args.seed,
+        require_fx=require_fx,
+        device=None,
+        fuse_sequences=fuse_sequences,
+    )
+
     device = next(iter(model.parameters())).device
-    val_data = create_validation_dataloader(val_data, args.seqlen, device)
     print("Data loaded.")
 
+    if args.eval:
+        assert args.export_target != 'torch_qcdq', "TorchScript QCDQ export and Evaluation simultaneously"
+        print("Float model eval...")
+        model = offload_model(model)
+        ppl = compute_perplexity(
+            model, validation_loader, context_length=args.seqlen // 2, tokenizer=tokenizer)
+        remove_hooks(model)
+        print(f"Float perplexity ({args.dataset}): {ppl}")
+
+    if require_fx:
+        model = get_fx(model)
+
     # Apply LN affine merging before inserting MHA layers
     # since currently there is support only for merging into Linear
     if args.ln_affine_merge:
         print("Apply LN affine merge...")
-        apply_layernorm_affine_merge(model, dtype, ref_kwargs={'input_ids': calibration_loader[0]})
+        apply_layernorm_affine_merge(model, dtype)
         print("LN affine merge applied.")
 
     # Insert standard MHA layers when performing fx based weight/act equalization to avoid dealing
@@ -296,11 +350,6 @@ def main():
         model = replace_mha_with_quantizable_layers(model, dtype)
         print("Replacing done.")
 
-    if args.weight_equalization or args.act_equalization == 'fx':
-        model = get_fx(model)
-        calibration_loader = modify_dataloader(args.model, calibration_loader, dtype=dtype)
-        val_data = modify_dataloader(args.model, val_data, dtype=dtype)
-
     if args.weight_equalization:
         print("Apply weight equalization...")
         # In case of float16 model, we need to offload to account for missing ops
@@ -317,39 +366,58 @@
         remove_hooks(model)
 
     if not args.no_quantize:
+        name_blacklist = []
         print("Applying model quantization...")
-        model = quantize_model(
-            model,
+        linear_input_quant, weight_quant, input_quant, q_scaled_quant, k_transposed_quant, v_quant, attn_output_weights_quant = generate_quantizers(
             dtype=dtype,
-            weight_quant_format=args.weight_quant_format,
-            weight_quant_type=args.weight_quant_type,
             weight_bit_width=args.weight_bit_width,
             weight_param_method=args.weight_param_method,
             weight_scale_precision=args.weight_scale_precision,
+            weight_quant_type=args.weight_quant_type,
             weight_quant_granularity=args.weight_quant_granularity,
             weight_group_size=args.weight_group_size,
             quantize_weight_zero_point=args.quantize_weight_zero_point,
+            weight_quant_format=args.weight_quant_format,
             input_bit_width=args.input_bit_width,
-            input_quant_type=args.input_quant_type,
             input_quant_format=args.input_quant_format,
-            input_param_method=args.input_param_method,
             input_scale_precision=args.input_scale_precision,
             input_scale_type=args.input_scale_type,
+            input_param_method=args.input_param_method,
+            input_quant_type=args.input_quant_type,
             input_quant_granularity=args.input_quant_granularity,
             input_group_size=args.input_group_size,
             quantize_input_zero_point=args.quantize_input_zero_point,
-            quantize_embedding=args.quantize_embedding)
+            use_ocp=args.use_ocp,
+            use_fnuz=args.use_fnuz,
+            device=device)
+        layer_map = generate_quant_maps(
+            linear_input_quant=linear_input_quant,
+            weight_quant=weight_quant,
+            input_quant=input_quant,
+            q_scaled_quant=q_scaled_quant,
+            k_transposed_quant=k_transposed_quant,
+            v_quant=v_quant,
+            attn_output_weights_quant=attn_output_weights_quant,
+            dtype=dtype,
+            device=device,
+            input_quant_format=args.input_quant_format,
+            quantize_embedding=False)
+        if not args.quantize_last_layer:
+            name_blacklist += ["lm_head"]
+        model = layerwise_quantize(
+            model=model, compute_layer_map=layer_map, name_blacklist=name_blacklist)
         # Tie back first/last layer weights in case they got untied
         print("Model quantization applied.")
 
     # If any equalization has taken places, the embedding layer and the fully connected one are
     # not tied anymore, and they need to be treated as standalone, separate layers.
     # In all other cases we can tie them back so to preserve memory.
-    if args.act_equalization is None and not args.weight_equalization:
+    if args.act_equalization is None and not require_fx:
         model.tie_weights()
 
-    with cast_to_float32(model, dtype):
-        model(**calibration_loader[0])
+    if args.bias_corr:
+        model = add_zero_bias_to_linear(model)
+
     model = offload_model(model)
 
     if args.act_calibration:
@@ -369,10 +437,15 @@ def main():
 
     if args.eval:
         print("Model eval...")
-        ppl = model_eval(model, val_data, args.seqlen)
-        print(f"C4 perplexity: {ppl}")
+        ppl = compute_perplexity(
+            model, validation_loader, context_length=args.seqlen // 2, tokenizer=tokenizer)
+        print(f"Quantized perplexity ({args.dataset}): {ppl}")
         remove_hooks(model)
 
+    if args.checkpoint_name is not None:
+        print(f"Saving checkpoint to {args.checkpoint_name}")
+        torch.save(model.state_dict(), args.checkpoint_name)
+
     if args.export_target:
         print(f"Export to {args.export_target}")
         # Currently we always export on CPU with a float32 container to avoid float16 CPU errors
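
Note: as a quick sanity check of the new helpers introduced above, the sketch below loads a validation split with get_dataset_for_model, adds explicit zero biases with add_zero_bias_to_linear, and scores perplexity with compute_perplexity, mirroring the flow now used in main.py. This is a minimal sketch, not part of the change: the model name ("facebook/opt-125m") and the reduced sample count are illustrative assumptions; the helper signatures follow the diff above.

# Illustrative sketch only; model name and sample sizes are assumptions.
import torch
from optimum.amd.brevitas.data_utils import compute_perplexity
from transformers import AutoModelForCausalLM, AutoTokenizer

from brevitas_examples.llm.llm_quant.data_utils import get_dataset_for_model
from brevitas_examples.llm.llm_quant.prepare_for_quantize import add_zero_bias_to_linear

model_name = "facebook/opt-125m"  # assumed example model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)

# Validation split of the dataset that main.py selects via --dataset.
validation_loader = get_dataset_for_model(
    model_name,
    dataset_name="wikitext2",
    tokenizer=tokenizer,
    nsamples=32,          # smaller than the default 128, for a quick check
    seqlen=2048,
    split="validation",
    seed=0,
    require_fx=False,     # set True only when the model will be fx-traced
    device=None,
    fuse_sequences=False)

# Give every nn.Linear an explicit zero bias so bias correction has a
# parameter to update, mirroring the --bias-corr path in main.py.
model = add_zero_bias_to_linear(model)

ppl = compute_perplexity(
    model, validation_loader, context_length=2048 // 2, tokenizer=tokenizer)
print(f"Perplexity (wikitext2): {ppl}")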