refine AutoRound format and support marlin repacking (#280)
---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
wenhuach21 and pre-commit-ci[bot] authored Oct 21, 2024
1 parent 7cfff96 commit 68138e8
Showing 12 changed files with 1,222 additions and 273 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -193,7 +193,7 @@ asymmetric kernel has issues** that can cause considerable accuracy drops, parti
 models.
 Additionally, symmetric quantization tends to perform poorly at 2-bit precision.
 
-**AutoAWQ Format**: This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely adopted
+**AutoAWQ Format**(>0.3.0): This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely adopted
 within the community, only 4-bits quantization is supported. It features
 specialized layer fusion tailored for Llama models.
@@ -230,13 +230,13 @@ in [Gaudi Guide](https://docs.habana.ai/en/latest/).
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from auto_round import AutoRoundConfig
-device = "auto" ##cpu, hpu, cuda
+backend = "auto" ##cpu, hpu, cuda, cuda:marlin('pip install -v gptqmodel --no-build-isolation')
 quantization_config = AutoRoundConfig(
-    backend=device
+    backend=backend
 )
 quantized_model_path = "./tmp_autoround"
 model = AutoModelForCausalLM.from_pretrained(quantized_model_path,
-                                             device_map=device, quantization_config=quantization_config)
+                                             device_map=backend.split(':')[0], quantization_config=quantization_config)
 tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
 text = "There is a girl who likes adventure,"
 inputs = tokenizer(text, return_tensors="pt").to(model.device)
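
Below is a minimal end-to-end sketch of the renamed backend option from this hunk, assuming a model has already been quantized to ./tmp_autoround and that gptqmodel is installed for the Marlin kernel (pip install -v gptqmodel --no-build-isolation); the max_new_tokens value is illustrative.

# Minimal sketch: load an AutoRound-quantized model with the Marlin backend.
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRoundConfig

backend = "cuda:marlin"  # alternatives per the diff: "cpu", "hpu", "cuda", "auto"
quantization_config = AutoRoundConfig(backend=backend)

quantized_model_path = "./tmp_autoround"
model = AutoModelForCausalLM.from_pretrained(
    quantized_model_path,
    device_map=backend.split(':')[0],  # strip the kernel suffix, leaving "cuda"
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
inputs = tokenizer("There is a girl who likes adventure,", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))  # max_new_tokens is illustrative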
6 changes: 4 additions & 2 deletions auto_round/__main__.py
@@ -92,7 +92,7 @@ def setup_parser():
 
     parser.add_argument("--format", default=None, type=str,
                         help="The format in which to save the model. "
-                             "The options are 'auto_round', 'auto_round:gptq','auto_round:marlin',"
+                             "The options are 'auto_round', 'auto_round:gptq','auto_round:awq',"
                              " 'auto_gptq', 'auto_awq', 'itrex', 'itrex_xpu' and 'fake'."
                              "default to 'auto_round."
                         )
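
For reference, a hypothetical invocation exercising the updated option list (the model path and other required arguments are omitted; comma-separated values export multiple formats, as handled in tune() below):

python -m auto_round --format "auto_round:gptq,auto_round:awq" ...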
@@ -316,7 +316,9 @@ def tune(args):
         format_list = args.format.replace(' ', '').split(',')
         inplace = False if len(format_list) > 1 else True
         for format_ in format_list:
-            eval_folder = f'{export_dir}-{format_}'
+            save_format_ = format_.replace(":", "-")
+            save_format_ = save_format_.replace("_", "-")
+            eval_folder = f'{export_dir}-{save_format_}'
             autoround.save_quantized(eval_folder, format=format_, inplace=inplace)
 
     lm_eval_version = get_library_version("lm-eval")
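
The two replace() calls added above normalize each format name into a filesystem-friendly folder suffix, so names containing ':' or '_' map cleanly onto directory names. A small sketch of the effect, using a hypothetical export_dir value:

export_dir = "./tmp_autoround"  # hypothetical value
for format_ in ["auto_round", "auto_round:gptq", "auto_round:awq"]:
    save_format_ = format_.replace(":", "-").replace("_", "-")
    print(f"{export_dir}-{save_format_}")
# ./tmp_autoround-auto-round
# ./tmp_autoround-auto-round-gptq
# ./tmp_autoround-auto-round-awq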
