From a5c322ac447e7c0bc26ee50550d893d5cd9a858c Mon Sep 17 00:00:00 2001
From: wenhuach21 <108330088+wenhuach21@users.noreply.github.com>
Date: Mon, 17 Jun 2024 15:08:05 +0800
Subject: [PATCH] fix bug when a whole block is excluded from quantization (#156)

---
 auto_round/autoround.py | 10 ++++++++--
 test/test_autoround.py  | 23 +++++++++++++++++++++++
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 5d135932..878eda46 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -824,8 +824,6 @@ def quant_block(self, block, input_ids, input_others, q_input=None, device=torch
         output = self.get_block_outputs(block, input_ids, input_others, self.train_bs, device, self.cache_device)

         if q_input is not None:
-            for i in range(len(input_ids)):
-                input_ids[i] = None
             input_ids = q_input
         torch.cuda.empty_cache()
         quantized_layer_names, unquantized_layer_names = wrapper_block(block, self.enable_minmax_tuning)
@@ -845,6 +843,14 @@ def quant_block(self, block, input_ids, input_others, q_input=None, device=torch
         else:
             optimizer = self.optimizer(round_params, lr=self.lr, weight_decay=0)

+        if len(round_params) + len(minmax_params) <= 0:
+            dump_info = (
+                f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} "
+                f"layers in the block"
+            )
+            logger.info(dump_info)
+            return output, output
+
         if self.lr_scheduler is None:
             lr_schedule = torch.optim.lr_scheduler.LinearLR(
                 optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters, verbose=False
diff --git a/test/test_autoround.py b/test/test_autoround.py
index d871705f..aff2f53f 100644
--- a/test/test_autoround.py
+++ b/test/test_autoround.py
@@ -33,6 +33,29 @@ def tearDownClass(self):
         shutil.rmtree("./saved", ignore_errors=True)
         shutil.rmtree("runs", ignore_errors=True)

+    def test_remove_whole_block(self):
+        weight_config = {"model.decoder.layers.0.self_attn.k_proj": {"data_type": "float"},
+                         "model.decoder.layers.0.self_attn.v_proj": {"data_type": "float"},
+                         "model.decoder.layers.0.self_attn.q_proj": {"data_type": "float"},
+                         "model.decoder.layers.0.self_attn.out_proj": {"data_type": "float"},
+                         "model.decoder.layers.0.fc1": {"data_type": "float"},
+                         "model.decoder.layers.0.fc2": {"data_type": "float"},
+                         }
+        bits, group_size, sym = 4, 128, False
+        autoround = AutoRound(
+            self.model,
+            self.tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=2,
+            seqlen=2,
+            dataset=self.llm_dataloader,
+            weight_config=weight_config
+        )
+        autoround.quantize()
+
+
     def test_default(self):
         bits, group_size, sym = 4, 128, False
         autoround = AutoRound(
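
Note (not part of the commit): the sketch below reproduces, standalone, the situation the new test covers: every layer of one decoder block is marked as "float" in weight_config, so the whole block is excluded and quant_block() takes the early-return path added in this patch instead of tuning a block with no round/minmax parameters. The model name facebook/opt-125m is an assumption for illustration only; the patch does not name a model, though the model.decoder.layers.0.* paths in the test match an OPT-style checkpoint. Calibration falls back to the library's default dataset.

# Minimal sketch, assuming auto-round at the state of this patch and an
# OPT-style model (facebook/opt-125m is an assumed checkpoint, chosen only
# because its module paths match the layer names used in the test above).
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Mark every layer of decoder block 0 as "float" so the entire block is
# excluded from quantization; this exercises the new early-return branch
# in quant_block() rather than tuning an empty parameter set.
weight_config = {
    f"model.decoder.layers.0.{name}": {"data_type": "float"}
    for name in ("self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj",
                 "self_attn.out_proj", "fc1", "fc2")
}

autoround = AutoRound(
    model,
    tokenizer,
    bits=4,
    group_size=128,
    sym=False,
    iters=2,      # small values keep the run short, mirroring the test
    seqlen=2,
    weight_config=weight_config,
)
autoround.quantize()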