[Experimental Feature] support for common hf multimodal (#276)
Signed-off-by: n1ck-guo <[email protected]>


Signed-off-by: n1ck-guo <[email protected]>
Co-authored-by: wenhuach21 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
3 people authored Oct 31, 2024
1 parent 4f22871 commit e643212
Showing 25 changed files with 1,326 additions and 127 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -4,7 +4,7 @@ AutoRound
===========================
<h3> Advanced Quantization Algorithm for LLMs</h3>

[![python](https://img.shields.io/badge/python-3.8%2B-blue)](https://github.com/intel/auto-round)
[![python](https://img.shields.io/badge/python-3.9%2B-blue)](https://github.com/intel/auto-round)
[![version](https://img.shields.io/badge/release-0.3.1-green)](https://github.com/intel/auto-round)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/intel/auto-round/blob/main/LICENSE)
---
3 changes: 2 additions & 1 deletion auto_round/__init__.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .autoround import AutoRound, AutoAdamRound, AutoOPTRound
from .autoround import AutoRound, AutoRoundAdam, AutoRoundOPT
from .mllm import AutoRoundMLLM
from .auto_quantizer import AutoHfQuantizer,AutoRoundConfig
from .version import __version__
371 changes: 279 additions & 92 deletions auto_round/__main__.py

Large diffs are not rendered by default.

44 changes: 23 additions & 21 deletions auto_round/autoround.py
@@ -22,6 +22,7 @@
from transformers import set_seed
from torch import autocast
from tqdm import tqdm
import accelerate

from .quantizer import WrapperMultiblock, wrapper_block, unwrapper_block, WrapperLinear, unwrapper_layer, \
WrapperTransformerConv1d
@@ -48,10 +49,9 @@
get_layer_names_in_block,
mv_module_from_gpu,
unsupport_meta_device, detect_device_count, clear_memory,
get_multimodal_block_names,
)

from .low_cpu_mem.utils import get_layers_before_block
import accelerate


class AutoRound(object):
@@ -529,11 +529,10 @@ def calib(self, nsamples, bs):
for key in data.keys():
data_new[key] = to_device(data[key], self.model.device)
if key == 'images':
data_new[key] = to_dtype(data[key], self.model.dtype)
data_new[key] = to_dtype(data_new[key], self.model.dtype)
input_ids = data_new["input_ids"]
if input_ids.shape[-1] < self.seqlen:
continue

try:
if isinstance(data_new, torch.Tensor):
self.model(data_new)
@@ -544,7 +543,7 @@
except NotImplementedError:
pass
except Exception as error:
logger.error(error)
raise error
total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1
if total_cnt >= nsamples:
break
@@ -595,18 +594,21 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
)
self.model = mv_module_from_gpu(self.model, self.low_cpu_mem_usage)
clear_memory()
except:
logger.info("switch to cpu to cache inputs")
if "lm_head" in self.layer_config and self.layer_config["lm_head"]["bits"] < 8:
logger.warning(f"we strongly recommend using additional CUDA/HPU devices,e.g. "
f"'CUDA_VISIBLE_DEVICES=0,1 python xxx',"
f" for optimal performance during calibration when enabling lm-head quantization. "
f"Otherwise, the process may be significantly slower.")
self.model = mv_module_from_gpu(self.model, self.low_cpu_mem_usage)
clear_memory()
all_inputs = self.cache_inter_data(
block_names, nsamples, layer_names=layer_names, last_cache_name=last_cache_name
)
except RuntimeError as e:
if "CUDA out of memory" in str(e):
logger.info("switch to cpu to cache inputs")
if "lm_head" in self.layer_config and self.layer_config["lm_head"]["bits"] < 8:
logger.warning(f"we strongly recommend using additional CUDA/HPU devices,e.g. "
f"'CUDA_VISIBLE_DEVICES=0,1 python xxx',"
f" for optimal performance during calibration when enabling lm-head quantization. "
f"Otherwise, the process may be significantly slower.")
self.model = mv_module_from_gpu(self.model, self.low_cpu_mem_usage)
clear_memory()
all_inputs = self.cache_inter_data(
block_names, nsamples, layer_names=layer_names, last_cache_name=last_cache_name
)
else:
raise
return all_inputs

@torch.no_grad()
@@ -1330,7 +1332,7 @@ def step(self, scaler, optimizer, lr_schedule):
lr_schedule.step()


class AutoOPTRound(AutoRound):
class AutoRoundOPT(AutoRound):
"""Class for automatic rounding-based quantization with optimizers like adamw of a PyTorch model.
Args:
@@ -1413,7 +1415,7 @@ def __init__(
optimizer="AdamW",
**kwargs,
):
super(AutoOPTRound, self).__init__(
super(AutoRoundOPT, self).__init__(
model=model,
tokenizer=tokenizer,
bits=bits,
@@ -1493,7 +1495,7 @@ def step(self, scaler, optimizer, lr_schedule):
htcore.mark_step()


class AutoAdamRound(AutoOPTRound):
class AutoRoundAdam(AutoRoundOPT):
"""Class for automatic rounding-based quantization with optimizers like adamw of a PyTorch model.
The default lr has been changed.
@@ -1577,7 +1579,7 @@ def __init__(
optimizer="AdamW",
**kwargs,
):
super(AutoAdamRound, self).__init__(
super(AutoRoundAdam, self).__init__(
model=model,
tokenizer=tokenizer,
bits=bits,
49 changes: 49 additions & 0 deletions auto_round/mllm/README.md
@@ -0,0 +1,49 @@
# AutoRound for MLLMs
## API Usage (Gaudi2/CPU/GPU)
```python
from auto_round import AutoRoundMLLM
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer

model_name = "Qwen/Qwen2-VL-2B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
tokenizer.processor = processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name, trust_remote_code=True)
dataset = "/path/to/llava.json"
extra_data_dir = "/path/to/images/dir"

bits, group_size = 4, 128
autoround = AutoRoundMLLM(model, tokenizer, bits=bits, group_size=group_size, dataset=dataset, extra_data_dir=extra_data_dir)

autoround.quantize()
output_dir = "./tmp_autoround"
autoround.save_quantized(output_dir, format='auto_round', inplace=True)
```

## Template
For AutoRound MLLMs, a ```Template``` is used to customize the operations applied for different models. Users can add a custom chat template through a JSON file as below.
```json
{
"model_type": "qwen2_vl",
"format_user": "<|im_start|>user\n{{content}}<|im_end|>\n",
"format_assistant": "<|im_start|>assistant\n{{content}}<|im_end|>\n",
"format_system": "<|im_start|>system\n{{content}}<|im_end|>\n",
"format_observation": "<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n",
"format_separator": "\n",
"default_system": "You are a helpful assistant.",
"replace_tokens": ["<image>", "<|vision_start|><|image_pad|><|vision_end|>"],
"processor": "qwen2_vl" }
```
The special token ```{{content}}``` is a placeholder that tells the preprocessor where to fill in the corresponding dialogue content.

```format_*```: adds role-specific tokens around the chat content, depending on the role name.

For example, given the input conversation:<br>
```[{'role': 'user', 'value': '<image>\nWhat are the colors of the bus in the image?'}, {'role': 'assistant', 'value': 'The bus in the image is white and red.'}]```

Using the above template, the input will be converted to the format required by Qwen2-VL as below: <br>
```'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>\nWhat are the colors of the bus in the image?<|im_end|>\n<|im_start|>assistant\nThe bus in the image is white and red.<|im_end|>\n<|im_start|>user\nWhat feature can be seen on the back of the bus?<|im_end|>\n<|im_start|>assistant\nThe back of the bus features an advertisement.<|im_end|>\n<|im_start|>user\nIs the bus driving down the street or pulled off to the side?<|im_end|>\n<|im_start|>assistant\nThe bus is driving down the street, which is crowded with people and other vehicles.<|im_end|>\n'```.
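
The sketch below illustrates how such a template might be applied to a conversation. It is a simplified stand-in, not the actual auto_round implementation; the helper name ```apply_template``` is hypothetical.
```python
# Illustrative sketch only -- a simplified stand-in for the real template logic,
# not the auto_round implementation. `apply_template` is a hypothetical helper.
template = {
    "format_user": "<|im_start|>user\n{{content}}<|im_end|>\n",
    "format_assistant": "<|im_start|>assistant\n{{content}}<|im_end|>\n",
    "format_system": "<|im_start|>system\n{{content}}<|im_end|>\n",
    "default_system": "You are a helpful assistant.",
    "replace_tokens": ["<image>", "<|vision_start|><|image_pad|><|vision_end|>"],
}

def apply_template(conversation, template):
    """Render a list of {'role', 'value'} turns into a single prompt string."""
    src, tgt = template["replace_tokens"]
    # Start with the system turn filled from the default system prompt.
    text = template["format_system"].replace("{{content}}", template["default_system"])
    for turn in conversation:
        role_format = template[f"format_{turn['role']}"]
        # Swap the generic <image> tag for the model-specific vision tokens.
        text += role_format.replace("{{content}}", turn["value"].replace(src, tgt))
    return text

conversation = [
    {"role": "user", "value": "<image>\nWhat are the colors of the bus in the image?"},
    {"role": "assistant", "value": "The bus in the image is white and red."},
]
print(apply_template(conversation, template))
```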

## Processor
The Processor is a callback interface for invoking different processors, such as text or image processors, for MLLMs. Users can define their own processor and declare it via the registration function. For more information, please refer to the relevant code in ```auto_round/mllm/processor.py```.
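
As an illustration of the registration pattern, a custom processor might be declared as below. This is a hedged sketch: the names ```register_processor```, ```PROCESSORS```, and the method signature are hypothetical and may differ from the actual code in ```auto_round/mllm/processor.py```.
```python
# Minimal sketch of the registration pattern; `register_processor`, `PROCESSORS`,
# and the method signature are hypothetical, not the actual auto_round API.
PROCESSORS = {}

def register_processor(name):
    """Decorator that registers a processor class under a model-type name."""
    def wrapper(cls):
        PROCESSORS[name] = cls
        return cls
    return wrapper

@register_processor("my_model_type")
class MyDataProcessor:
    def get_input(self, text, images=None, **kwargs):
        """Turn raw text/images into the inputs the model expects."""
        # A real processor would call the tokenizer/image processor here.
        return {"text": text, "images": images}

# The calibration dataloader could then look the processor up by model type:
processor_cls = PROCESSORS["my_model_type"]
```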
17 changes: 17 additions & 0 deletions auto_round/mllm/__init__.py
@@ -0,0 +1,17 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .mllm_dataset import get_mllm_dataloader
from .template import Template, get_template, TEMPLATES
from .autoround_mllm import AutoRoundMLLM