From 7bf87f7648b6a20d45800961fe6bf5aa5d55dcf4 Mon Sep 17 00:00:00 2001
From: yintong-lu
Date: Tue, 18 Jun 2024 04:46:33 -0400
Subject: [PATCH 1/3] fix bug and limit numpy version

Signed-off-by: yintong-lu
---
 auto_round/utils.py                         | 1 +
 examples/language-modeling/requirements.txt | 1 +
 requirements.txt                            | 1 +
 3 files changed, 3 insertions(+)

diff --git a/auto_round/utils.py b/auto_round/utils.py
index 47c4ef2f..fff4a52e 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -660,6 +660,7 @@ def dynamic_import_inference_linear(bits, group_size, backend):
     elif bits == 4 and "exllamav2" in backend:
         logger.warning_once("Please install auto-round from source to enable exllamav2 kernels, switch to triton "
                             "kernels for now")
+        from auto_round_extension.cuda.qliner_triton import QuantLinear
     else:
         from auto_round_extension.cuda.qliner_triton import QuantLinear
     return QuantLinear
diff --git a/examples/language-modeling/requirements.txt b/examples/language-modeling/requirements.txt
index 292166e4..13d67e77 100644
--- a/examples/language-modeling/requirements.txt
+++ b/examples/language-modeling/requirements.txt
@@ -17,4 +17,5 @@ auto-gptq
 openpyxl
 wandb
 py-cpuinfo
+numpy < 2.0
diff --git a/requirements.txt b/requirements.txt
index cb8df6bb..0f5c4f7a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ sentencepiece
 torch
 transformers
 triton
+numpy < 2.0
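
The one-line import that patch 1 adds matters because dynamic_import_inference_linear ends with "return QuantLinear": in the exllamav2-fallback branch the function previously only logged a warning and never bound the name QuantLinear, so the return raised NameError. Below is a minimal, self-contained sketch of that control flow; the function name pick_quant_linear and the string stand-ins are hypothetical, not from the repo (the real code lives in auto_round/utils.py):

    # Simplified sketch of the bug patch 1 fixes: before the fix, the elif
    # branch warned but never assigned QuantLinear, so the final return
    # raised NameError whenever that branch was taken.
    def pick_quant_linear(bits: int, backend: str, exllama2_available: bool) -> str:
        if bits == 4 and exllama2_available and "exllamav2" in backend:
            QuantLinear = "exllamav2 kernel"   # stands in for the CUDA import
        elif bits == 4 and "exllamav2" in backend:
            print("warning: switching to triton kernels for now")
            QuantLinear = "triton kernel"      # the binding patch 1 adds back
        else:
            QuantLinear = "triton kernel"
        return QuantLinear                     # NameError here before the fix

    print(pick_quant_linear(4, "exllamav2", exllama2_available=False))  # -> triton kernel
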
From 81d99f722a4d6f30237db0e1e75901e69ab65141 Mon Sep 17 00:00:00 2001
From: yintong-lu
Date: Tue, 18 Jun 2024 05:02:11 -0400
Subject: [PATCH 2/3] minor fix

Signed-off-by: yintong-lu
---
 auto_round/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/auto_round/utils.py b/auto_round/utils.py
index fff4a52e..0e83f493 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -611,7 +611,6 @@ def get_autogptq_backend_config(backend, bits=4):
 
 def warning_once(logger, msg: str):
     logger.warning(msg)
-logger.warning_once = warning_once
 
 def dynamic_import_inference_linear(bits, group_size, backend):
     """Dynamically imports and returns the appropriate QuantLinear class based on the given bits and backend.
@@ -658,7 +657,7 @@ def dynamic_import_inference_linear(bits, group_size, backend):
     if bits == 4 and exllama2_available and "exllamav2" in backend:
         from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
     elif bits == 4 and "exllamav2" in backend:
-        logger.warning_once("Please install auto-round from source to enable exllamav2 kernels, switch to triton "
+        warning_once(logger, msg="Please install auto-round from source to enable exllamav2 kernels, switch to triton "
                             "kernels for now")
         from auto_round_extension.cuda.qliner_triton import QuantLinear
     else:

From b8326a8c6de7a3ee6317f790bc544a4428dd6a2c Mon Sep 17 00:00:00 2001
From: yintong-lu
Date: Tue, 18 Jun 2024 05:40:01 -0400
Subject: [PATCH 3/3] modify warning_once

Signed-off-by: yintong-lu
---
 auto_round/utils.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/auto_round/utils.py b/auto_round/utils.py
index 0e83f493..2b8d817b 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -25,6 +25,13 @@
 import torch
 from torch.amp import autocast
 
+from functools import lru_cache
+@lru_cache(None)
+def warning_once(self, msg: str):
+    self.warning(msg)
+
+
+logging.Logger.warning_once = warning_once
 logger = logging.getLogger("autoround")
 logger.setLevel(logging.INFO)
 logger.propagate = False
@@ -35,7 +42,6 @@
 import importlib
 import transformers
-from functools import lru_cache
 
 
 class LazyImport(object):
     """Lazy import python module till use."""
@@ -607,10 +613,6 @@ def get_autogptq_backend_config(backend, bits=4):
         use_triton = False
     return use_triton, disable_exllamav1, disable_exllamav2, use_qigen, disable_marlin
 
-@lru_cache(None)
-def warning_once(logger, msg: str):
-    logger.warning(msg)
-
 
 def dynamic_import_inference_linear(bits, group_size, backend):
     """Dynamically imports and returns the appropriate QuantLinear class based on the given bits and backend.
@@ -657,7 +659,7 @@ def dynamic_import_inference_linear(bits, group_size, backend):
     if bits == 4 and exllama2_available and "exllamav2" in backend:
         from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
     elif bits == 4 and "exllamav2" in backend:
-        warning_once(logger, msg="Please install auto-round from source to enable exllamav2 kernels, switch to triton "
+        logger.warning_once("Please install auto-round from source to enable exllamav2 kernels, switch to triton "
                             "kernels for now")
         from auto_round_extension.cuda.qliner_triton import QuantLinear
     else:
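
Two details make patch 3 work where the earlier attempts did not. Assigning the function to the class logging.Logger turns it into an ordinary method: attribute lookup goes through the descriptor protocol, so self is bound automatically. The per-instance assignment removed in patch 2 (logger.warning_once = warning_once) never bound the logger, so logger.warning_once(msg) came up one argument short. And lru_cache(None) keys its cache on the (self, msg) pair, so each distinct message is emitted once per logger. The sketch below exercises the same pattern standalone; the decorated function and class assignment are taken from the diff, while the basicConfig call and the "demo" logger are added here only for illustration:

    import logging
    from functools import lru_cache


    @lru_cache(None)  # unbounded cache keyed on the (self, msg) pair
    def warning_once(self, msg: str):
        self.warning(msg)


    # Class-level assignment makes this a bound method on every Logger.
    logging.Logger.warning_once = warning_once

    logging.basicConfig(level=logging.WARNING)
    log = logging.getLogger("demo")
    log.warning_once("shown once")
    log.warning_once("shown once")  # cache hit: nothing is logged

One trade-off worth noting: the unbounded cache holds strong references to every (logger, message) pair for the life of the process, which is fine for a fixed set of warning strings but would grow without bound if messages were generated dynamically.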