[Experimental Feature] support for common hf multimodal (#276)
Signed-off-by: n1ck-guo <[email protected]>


Signed-off-by: n1ck-guo <[email protected]>
Co-authored-by: wenhuach21 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
3 people authored Oct 31, 2024
1 parent 4f22871 commit e643212
Showing 25 changed files with 1,326 additions and 127 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -4,7 +4,7 @@ AutoRound
===========================
<h3> Advanced Quantization Algorithm for LLMs</h3>

[![python](https://img.shields.io/badge/python-3.8%2B-blue)](https://github.com/intel/auto-round)
[![python](https://img.shields.io/badge/python-3.9%2B-blue)](https://github.com/intel/auto-round)
[![version](https://img.shields.io/badge/release-0.3.1-green)](https://github.com/intel/auto-round)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/intel/auto-round/blob/main/LICENSE)
---
3 changes: 2 additions & 1 deletion auto_round/__init__.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .autoround import AutoRound, AutoAdamRound, AutoOPTRound
from .autoround import AutoRound, AutoRoundAdam, AutoRoundOPT
from .mllm import AutoRoundMLLM
from .auto_quantizer import AutoHfQuantizer,AutoRoundConfig
from .version import __version__
371 changes: 279 additions & 92 deletions auto_round/__main__.py

Large diffs are not rendered by default.

44 changes: 23 additions & 21 deletions auto_round/autoround.py
@@ -22,6 +22,7 @@
from transformers import set_seed
from torch import autocast
from tqdm import tqdm
import accelerate

from .quantizer import WrapperMultiblock, wrapper_block, unwrapper_block, WrapperLinear, unwrapper_layer, \
WrapperTransformerConv1d
@@ -48,10 +49,9 @@
get_layer_names_in_block,
mv_module_from_gpu,
unsupport_meta_device, detect_device_count, clear_memory,
get_multimodal_block_names,
)

from .low_cpu_mem.utils import get_layers_before_block
import accelerate


class AutoRound(object):
@@ -529,11 +529,10 @@ def calib(self, nsamples, bs):
for key in data.keys():
data_new[key] = to_device(data[key], self.model.device)
if key == 'images':
data_new[key] = to_dtype(data[key], self.model.dtype)
data_new[key] = to_dtype(data_new[key], self.model.dtype)
input_ids = data_new["input_ids"]
if input_ids.shape[-1] < self.seqlen:
continue

try:
if isinstance(data_new, torch.Tensor):
self.model(data_new)
@@ -544,7 +543,7 @@
except NotImplementedError:
pass
except Exception as error:
logger.error(error)
raise error
total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1
if total_cnt >= nsamples:
break
@@ -595,18 +594,21 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
)
self.model = mv_module_from_gpu(self.model, self.low_cpu_mem_usage)
clear_memory()
except:
logger.info("switch to cpu to cache inputs")
if "lm_head" in self.layer_config and self.layer_config["lm_head"]["bits"] < 8:
logger.warning(f"we strongly recommend using additional CUDA/HPU devices,e.g. "
f"'CUDA_VISIBLE_DEVICES=0,1 python xxx',"
f" for optimal performance during calibration when enabling lm-head quantization. "
f"Otherwise, the process may be significantly slower.")
self.model = mv_module_from_gpu(self.model, self.low_cpu_mem_usage)
clear_memory()
all_inputs = self.cache_inter_data(
block_names, nsamples, layer_names=layer_names, last_cache_name=last_cache_name
)
except RuntimeError as e:
if "CUDA out of memory" in str(e):
logger.info("switch to cpu to cache inputs")
if "lm_head" in self.layer_config and self.layer_config["lm_head"]["bits"] < 8:
logger.warning(f"we strongly recommend using additional CUDA/HPU devices,e.g. "
f"'CUDA_VISIBLE_DEVICES=0,1 python xxx',"
f" for optimal performance during calibration when enabling lm-head quantization. "
f"Otherwise, the process may be significantly slower.")
self.model = mv_module_from_gpu(self.model, self.low_cpu_mem_usage)
clear_memory()
all_inputs = self.cache_inter_data(
block_names, nsamples, layer_names=layer_names, last_cache_name=last_cache_name
)
else:
raise
return all_inputs

@torch.no_grad()
@@ -1330,7 +1332,7 @@ def step(self, scaler, optimizer, lr_schedule):
lr_schedule.step()


class AutoOPTRound(AutoRound):
class AutoRoundOPT(AutoRound):
"""Class for automatic rounding-based quantization with optimizers like adamw of a PyTorch model.
Args:
@@ -1413,7 +1415,7 @@ def __init__(
optimizer="AdamW",
**kwargs,
):
super(AutoOPTRound, self).__init__(
super(AutoRoundOPT, self).__init__(
model=model,
tokenizer=tokenizer,
bits=bits,
@@ -1493,7 +1495,7 @@ def step(self, scaler, optimizer, lr_schedule):
htcore.mark_step()


class AutoAdamRound(AutoOPTRound):
class AutoRoundAdam(AutoRoundOPT):
"""Class for automatic rounding-based quantization with optimizers like adamw of a PyTorch model.
The default lr has been changed.
@@ -1577,7 +1579,7 @@ def __init__(
optimizer="AdamW",
**kwargs,
):
super(AutoAdamRound, self).__init__(
super(AutoRoundAdam, self).__init__(
model=model,
tokenizer=tokenizer,
bits=bits,
49 changes: 49 additions & 0 deletions auto_round/mllm/README.md
@@ -0,0 +1,49 @@
# AutoRound for MLLMs
## API Usage (Gaudi2/CPU/GPU)
```python
from auto_round import AutoRoundMLLM
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer

model_name = "Qwen/Qwen2-VL-2B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
tokenizer.processor = processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name, trust_remote_code=True)
dataset = "/path/to/llava.json"
extra_data_dir = "/path/to/images/dir"

bits, group_size = 4, 128
autoround = AutoRoundMLLM(model, tokenizer, bits=bits, group_size=group_size, dataset=dataset, extra_data_dir=extra_data_dir)

autoround.quantize()
output_dir = "./tmp_autoround"
autoround.save_quantized(output_dir, format='auto_round', inplace=True)
```

## Template
For AutoRound MLLMs, a ```Template``` is used to customize the operations applied for different models. Users can add a custom chat template through a JSON file as below.
```json
{
"model_type": "qwen2_vl",
"format_user": "<|im_start|>user\n{{content}}<|im_end|>\n",
"format_assistant": "<|im_start|>assistant\n{{content}}<|im_end|>\n",
"format_system": "<|im_start|>system\n{{content}}<|im_end|>\n",
"format_observation": "<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n",
"format_separator": "\n",
"default_system": "You are a helpful assistant.",
"replace_tokens": ["<image>", "<|vision_start|><|image_pad|><|vision_end|>"],
"processor": "qwen2_vl" }
```
The special token ```{{content}}``` is a placeholder that tells the preprocessor where to fill in the corresponding dialogue content.

```format_*```: adds role-specific tokens around the chat content, depending on the role name.

For example, given the input conversation:<br>
```[{'role': 'user', 'value': '<image>\nWhat are the colors of the bus in the image?'}, {'role': 'assistant', 'value': 'The bus in the image is white and red.'}]```

Using the above template, the input will be converted to the format required by Qwen2-VL as below: <br>
```'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>\nWhat are the colors of the bus in the image?<|im_end|>\n<|im_start|>assistant\nThe bus in the image is white and red.<|im_end|>\n<|im_start|>user\nWhat feature can be seen on the back of the bus?<|im_end|>\n<|im_start|>assistant\nThe back of the bus features an advertisement.<|im_end|>\n<|im_start|>user\nIs the bus driving down the street or pulled off to the side?<|im_end|>\n<|im_start|>assistant\nThe bus is driving down the street, which is crowded with people and other vehicles.<|im_end|>\n'```.
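
The sketch below illustrates how such a template might be applied to a conversation. It is a simplified stand-in, not the actual auto_round implementation; the helper name ```apply_template``` is hypothetical.
```python
# Illustrative sketch only -- a simplified stand-in for the real template logic,
# not the auto_round implementation. `apply_template` is a hypothetical helper.
template = {
    "format_user": "<|im_start|>user\n{{content}}<|im_end|>\n",
    "format_assistant": "<|im_start|>assistant\n{{content}}<|im_end|>\n",
    "format_system": "<|im_start|>system\n{{content}}<|im_end|>\n",
    "default_system": "You are a helpful assistant.",
    "replace_tokens": ["<image>", "<|vision_start|><|image_pad|><|vision_end|>"],
}

def apply_template(conversation, template):
    """Render a list of {'role', 'value'} turns into a single prompt string."""
    src, tgt = template["replace_tokens"]
    # Start with the system turn filled from the default system prompt.
    text = template["format_system"].replace("{{content}}", template["default_system"])
    for turn in conversation:
        role_format = template[f"format_{turn['role']}"]
        # Swap the generic <image> tag for the model-specific vision tokens.
        text += role_format.replace("{{content}}", turn["value"].replace(src, tgt))
    return text

conversation = [
    {"role": "user", "value": "<image>\nWhat are the colors of the bus in the image?"},
    {"role": "assistant", "value": "The bus in the image is white and red."},
]
print(apply_template(conversation, template))
```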

## Processor
The Processor is a callback interface for invoking different processors, such as text or image processors, for MLLMs. Users can define their own processor and declare it via the registration function. For more information, please refer to the relevant code in ```auto_round/mllm/processor.py```.
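
As an illustration of the registration pattern, a custom processor might be declared as below. This is a hedged sketch: the names ```register_processor```, ```PROCESSORS```, and the method signature are hypothetical and may differ from the actual code in ```auto_round/mllm/processor.py```.
```python
# Minimal sketch of the registration pattern; `register_processor`, `PROCESSORS`,
# and the method signature are hypothetical, not the actual auto_round API.
PROCESSORS = {}

def register_processor(name):
    """Decorator that registers a processor class under a model-type name."""
    def wrapper(cls):
        PROCESSORS[name] = cls
        return cls
    return wrapper

@register_processor("my_model_type")
class MyDataProcessor:
    def get_input(self, text, images=None, **kwargs):
        """Turn raw text/images into the inputs the model expects."""
        # A real processor would call the tokenizer/image processor here.
        return {"text": text, "images": images}

# The calibration dataloader could then look the processor up by model type:
processor_cls = PROCESSORS["my_model_type"]
```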
17 changes: 17 additions & 0 deletions auto_round/mllm/__init__.py
@@ -0,0 +1,17 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .mllm_dataset import get_mllm_dataloader
from .template import Template, get_template, TEMPLATES
from .autoround_mllm import AutoRoundMLLM