
Commit

Merge branch 'main' into add_gpu_ut
wenhuach21 authored Dec 6, 2024
2 parents 92d1bc7 + e4528e9 commit 770295e
Showing 12 changed files with 164 additions and 42 deletions.
11 changes: 8 additions & 3 deletions .azure-pipelines/scripts/ut/run_ut_hpu.sh
@@ -18,9 +18,14 @@ LOG_DIR=/auto-round/log_dir
mkdir -p ${LOG_DIR}
ut_log_name=${LOG_DIR}/ut.log

find . -name "test*hpu_only.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh
cat run.sh
bash run.sh 2>&1 | tee ${ut_log_name}
find . -name "test*hpu_only.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh
find . -name "test*hpu_only.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh

cat run_lazy.sh
bash run_lazy.sh 2>&1 | tee ${ut_log_name}

cat run_compile.sh
bash run_compile.sh 2>&1 | tee ${ut_log_name}

cp report.html ${LOG_DIR}/
cp coverage.xml ${LOG_DIR}/
3 changes: 0 additions & 3 deletions auto_round/__main__.py
@@ -53,9 +53,6 @@ def run_lmms():
lmms_eval(args)

def switch():
# if "--lmms" in sys.argv:
# sys.argv.remove("--lmms")
# run_lmms()
if "--mllm" in sys.argv:
sys.argv.remove("--mllm")
run_mllm()
3 changes: 3 additions & 0 deletions auto_round/autoround.py
@@ -1263,6 +1263,9 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs):
self.model.save_pretrained(output_dir)
if self.tokenizer is not None:
self.tokenizer.save_pretrained(output_dir)
processor = kwargs.get("processor", None)
if processor is not None:
processor.save_pretrained(output_dir)
return

from auto_round.export import EXPORT_FORMAT
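A hedged usage sketch of the new processor handling above (the processor id, the output directory, and the pre-built `autoround` instance are assumptions for illustration; whether the processor file is actually written depends on which export path the chosen format takes):

from transformers import AutoProcessor

# Illustrative objects; `autoround` is assumed to be an existing AutoRound instance.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# The new lines above read `processor` from **kwargs and call
# processor.save_pretrained(output_dir) next to the model and tokenizer.
autoround.save_quantized("./quantized_model", processor=processor)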
4 changes: 2 additions & 2 deletions auto_round/export/export_to_autogptq/export.py
@@ -88,8 +88,8 @@ def pack_layer(name, model, layer_config, backend, pbar):
in_features = layer.weight.shape[0]
out_features = layer.weight.shape[1]

##bias = layer.bias is not None and torch.any(layer.bias)
bias = True ## if using the above, llama3 lambada RTN will be NAN , TODO why?
bias = layer.bias is not None
##bias = True ## if using the above, llama3 lambada RTN will be NAN , TODO why?
new_layer = QuantLinear( ##pylint: disable=E1123
bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
)
2 changes: 1 addition & 1 deletion auto_round/export/export_to_autoround/export.py
@@ -110,7 +110,7 @@ def pack_layer(name, model, layer_config, backend, pbar):
elif isinstance(layer, transformers.pytorch_utils.Conv1D):
in_features = layer.weight.shape[0]
out_features = layer.weight.shape[1]
bias = layer.bias is not None and torch.any(layer.bias)
bias = layer.bias is not None

if "awq" not in backend:
new_layer = QuantLinear( ##pylint: disable=E1123
25 changes: 21 additions & 4 deletions auto_round/mllm/autoround_mllm.py
@@ -38,7 +38,7 @@ def _only_text_test(model, tokenizer, device):
tokenizer.padding_side = 'left'
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
if device != model.device.type:
if device.split(':')[0] != model.device.type:
model = model.to(device)
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
model(**inputs)
@@ -150,19 +150,20 @@
self.to_quant_block_names = to_quant_block_names
self.extra_data_dir = extra_data_dir
self.quant_nontext_module = quant_nontext_module
self.processor = processor
self.image_processor = image_processor
self.template = template if template is not None else model.config.model_type
if not isinstance(dataset, torch.utils.data.DataLoader):
self.template = get_template(
self.template, model=model, tokenizer=tokenizer, processor=processor, image_processor=image_processor)

dataset = self.template.default_dataset if dataset is None else dataset
dataset = self.template.default_dataset if dataset is None else dataset

from ..calib_dataset import CALIB_DATASETS
from .mllm_dataset import MLLM_DATASET
if isinstance(dataset, str):
if quant_nontext_module or \
(dataset in CALIB_DATASETS.keys() and not _only_text_test(model, tokenizer, device)):
(dataset in CALIB_DATASETS.keys() and not \
_only_text_test(model, tokenizer, device)):
if quant_nontext_module:
logger.warning(f"Text only dataset cannot be used for calibrating non-text modules,"
"switching to liuhaotian/llava_conv_58k")
@@ -372,4 +373,20 @@ def calib(self, nsamples, bs):
m = m.to("meta")
# torch.cuda.empty_cache()

def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs):
"""Save the quantized model to the specified output directory in the specified format.
Args:
output_dir (str, optional): The directory to save the quantized model. Defaults to None.
format (str, optional): The format in which to save the model. Defaults to "auto_round".
inplace (bool, optional): Whether to modify the model in place. Defaults to True.
**kwargs: Additional keyword arguments specific to the export format.
Returns:
object: The compressed model object.
"""
if self.processor is not None and not hasattr(self.processor, "chat_template"):
self.processor.chat_template = None
compressed_model = super().save_quantized(
output_dir=output_dir, format=format, inplace=inplace, processor=self.processor, **kwargs)
return compressed_model
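A hedged usage sketch of the override above (the directory, format value, and the pre-built instance are assumptions; the surrounding class in this file is referred to here as AutoRoundMLLM):

# `autoround` is assumed to be an already-constructed AutoRoundMLLM instance.
# The override clears a missing chat_template and forwards self.processor to the
# base-class save_quantized, so callers no longer need to pass the processor themselves.
compressed_model = autoround.save_quantized("./qmodel-autoround", format="auto_round", inplace=True)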
37 changes: 19 additions & 18 deletions auto_round/mllm/template.py
@@ -118,24 +118,25 @@ def _register_template(

def load_template(path: str):
"""Load template information from a json file."""
data = json.load(open(path, "r"))
if "model_type" not in data:
data["model_type"] = "user_define"
if "replace_tokens" in data and data["replace_tokens"] is not None:
assert len(data["replace_tokens"]) % 2 == 0, \
"the format of replace_tokens should be [old_tag1, replace_tag1, old_tag2, replace_tag2]"
temp = []
for i in range(0, len(data["replace_tokens"]), 2):
temp.append((data["replace_tokens"][i], data["replace_tokens"][i + 1]))
data["replace_tokens"] = temp
if "processor" in data:
assert data["processor"] in PROCESSORS.keys(), \
"{} is not supported, current support: {}".format(data["processor"], ",".join(PROCESSORS.keys()))
data["processor"] = PROCESSORS[data["processor"]]
template = _register_template(
**data
)
return template
with open(path, "r") as file:
data = json.load(file)
if "model_type" not in data:
data["model_type"] = "user_define"
if "replace_tokens" in data and data["replace_tokens"] is not None:
assert len(data["replace_tokens"]) % 2 == 0, \
"the format of replace_tokens should be [old_tag1, replace_tag1, old_tag2, replace_tag2]"
temp = []
for i in range(0, len(data["replace_tokens"]), 2):
temp.append((data["replace_tokens"][i], data["replace_tokens"][i + 1]))
data["replace_tokens"] = temp
if "processor" in data:
assert data["processor"] in PROCESSORS.keys(), \
"{} is not supported, current support: {}".format(data["processor"], ",".join(PROCESSORS.keys()))
data["processor"] = PROCESSORS[data["processor"]]
template = _register_template(
**data
)
return template


def _load_preset_template():
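For reference, a hedged illustration of the file shape load_template consumes, expressed as the dict it would parse (the values are made up; only the keys and the flat old/new pairing of replace_tokens follow the checks above):

import json
import tempfile

# Hypothetical user-defined template; a "processor" entry, if present, must be a key of PROCESSORS.
user_template = {
    "model_type": "my_vlm",                      # optional; defaults to "user_define"
    "replace_tokens": ["<image>", "<img_tok>"],  # flat [old_tag, new_tag, ...] pairs
}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as tmp:
    json.dump(user_template, tmp)
# template = load_template(tmp.name)  # would validate the fields and register the template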
4 changes: 1 addition & 3 deletions auto_round/script/mllm.py
@@ -418,13 +418,11 @@ def tune(args):
inplace = False if len(format_list) > 1 else True
for format_ in format_list:
eval_folder = f'{export_dir}-{format_}'
if processor is not None and not hasattr(processor, "chat_template"):
processor.chat_template = None
safe_serialization = True
if "phi3_v" in model_type:
safe_serialization = False
autoround.save_quantized(
eval_folder, format=format_, inplace=inplace, processor=processor, safe_serialization=safe_serialization)
eval_folder, format=format_, inplace=inplace, safe_serialization=safe_serialization)


def eval(args):
29 changes: 22 additions & 7 deletions auto_round/utils.py
@@ -333,7 +333,7 @@ def extract_block_names_to_str(quant_block_list):
prefixes = [get_common_prefix(blocks) for blocks in quant_block_list]
# Join prefixes into a single string
return ','.join(prefixes)


def find_matching_blocks(model, all_blocks, to_quant_block_names):
"""
@@ -966,20 +966,36 @@ def torch_version_at_least(version_string):
TORCH_VERSION_AT_LEAST_2_5 = torch_version_at_least("2.5.0")
TORCH_VERSION_AT_LEAST_2_4 = torch_version_at_least("2.4.0")

# Note on HPU usage:
# There are two modes available for enabling auto-round on HPU:
# 1. Compile Mode
# 1) Use PyTorch version ≥ 2.4 (Intel® Gaudi® v1.18 or later)
# 2) Set `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1`
# The compile mode can speed up quantization process but still in experimental stage.
# 2. Lazy Mode (By default)

def check_hpu_compile_mode():

def _check_hpu_compile_mode():
assert (
os.getenv("PT_HPU_LAZY_MODE") == "0"
os.getenv("PT_HPU_LAZY_MODE") == "0"
), "Please set `PT_HPU_LAZY_MODE=0` to use HPU compile mode"
# Note: this is a temporary solution, will be removed in the future
assert (
os.getenv("PT_ENABLE_INT64_SUPPORT") == "1"
os.getenv("PT_ENABLE_INT64_SUPPORT") == "1"
), "Please set `PT_ENABLE_INT64_SUPPORT=1` to use HPU compile mode"


def is_hpu_lazy_mode():
return os.getenv("PT_HPU_LAZY_MODE") != "0"


def _use_hpu_compile_mode():
return TORCH_VERSION_AT_LEAST_2_4 and not is_hpu_lazy_mode()


def compile_func_on_hpu(func):
if TORCH_VERSION_AT_LEAST_2_4:
check_hpu_compile_mode()
if _use_hpu_compile_mode():
_check_hpu_compile_mode()
return torch.compile(func, backend="hpu_backend")
return func

@@ -1097,4 +1113,3 @@ def get_fp_layer_names(model, fp_layers):
not_to_quantized_layers.append(name)

return not_to_quantized_layers
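A hedged sketch of how the new HPU helpers above combine (the environment values and the example function are illustrative; an actual HPU build of PyTorch is needed for the hpu_backend to compile anything):

import os

# Compile mode requires both variables, as asserted by _check_hpu_compile_mode above.
os.environ["PT_HPU_LAZY_MODE"] = "0"
os.environ["PT_ENABLE_INT64_SUPPORT"] = "1"

from auto_round.utils import compile_func_on_hpu

def example_step(x):
    return x * 2

# With torch >= 2.4 and lazy mode disabled, the helper wraps the function with
# torch.compile(..., backend="hpu_backend"); otherwise it returns the function unchanged.
example_step = compile_func_on_hpu(example_step)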

9 changes: 9 additions & 0 deletions test/_test_helpers.py
@@ -0,0 +1,9 @@
import pytest


def is_pytest_mode_compile():
return pytest.mode == "compile"


def is_pytest_mode_lazy():
return pytest.mode == "lazy"
34 changes: 34 additions & 0 deletions test/conftest.py
@@ -0,0 +1,34 @@
import os
from typing import Mapping

import pytest


def pytest_addoption(parser):
parser.addoption(
"--mode",
action="store",
default="lazy",
help="{compile|lazy}, default lazy. Choose mode to run tests",
)


backup_env = pytest.StashKey[Mapping]()


def pytest_configure(config):
pytest.mode = config.getoption("--mode")
assert pytest.mode.lower() in ["lazy", "compile"]

config.stash[backup_env] = os.environ

if pytest.mode == "lazy":
os.environ["PT_HPU_LAZY_MODE"] = "1"
elif pytest.mode == "compile":
os.environ["PT_HPU_LAZY_MODE"] = "0"
os.environ["PT_ENABLE_INT64_SUPPORT"] = "1"


def pytest_unconfigure(config):
os.environ.clear()
os.environ.update(config.stash[backup_env])
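A hedged example of driving the new --mode option programmatically (the test file name is taken from this commit; the command-line equivalent is `pytest --mode compile test_auto_round_hpu_only.py`):

import pytest

# Run from the test/ directory so this conftest.py registers the --mode option.
# pytest_configure above maps --mode to PT_HPU_LAZY_MODE / PT_ENABLE_INT64_SUPPORT,
# and pytest_unconfigure restores the original environment afterwards.
pytest.main(["--mode", "compile", "test_auto_round_hpu_only.py"])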
45 changes: 44 additions & 1 deletion test/test_auto_round_hpu_only.py
@@ -1,3 +1,46 @@
import pytest
import torch
from auto_round.utils import is_hpu_supported

from _test_helpers import is_pytest_mode_compile, is_pytest_mode_lazy


def run_opt_125m_on_hpu():
from auto_round import AutoRound
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

bits, group_size, sym = 4, 128, False
autoround = AutoRound(
model,
tokenizer,
bits=bits,
group_size=group_size,
sym=sym,
iters=2,
seqlen=2,
)
q_model, qconfig = autoround.quantize()
assert q_model is not None, f"Expected q_model to be not None"


@pytest.mark.skipif(not is_hpu_supported(), reason="HPU is not supported")
@pytest.mark.skipif(not is_pytest_mode_lazy(), reason="Only for lazy mode")
def test_opt_125m_lazy_mode():
run_opt_125m_on_hpu()


@pytest.mark.skipif(not is_hpu_supported(), reason="HPU is not supported")
@pytest.mark.skipif(not is_pytest_mode_compile(), reason="Only for compile mode")
def test_opt_125m_compile_mode():
torch._dynamo.reset()
run_opt_125m_on_hpu()


def test_import():
from auto_round import AutoRound
from auto_round.export.export_to_itrex.export import save_quantized_as_itrex, WeightOnlyLinear
from auto_round.export.export_to_itrex.export import (
WeightOnlyLinear, save_quantized_as_itrex)
