From 7bf87f7648b6a20d45800961fe6bf5aa5d55dcf4 Mon Sep 17 00:00:00 2001
From: yintong-lu
Date: Tue, 18 Jun 2024 04:46:33 -0400
Subject: [PATCH 1/3] fix bug and limit numpy version

Signed-off-by: yintong-lu
---
 auto_round/utils.py                         | 1 +
 examples/language-modeling/requirements.txt | 1 +
 requirements.txt                            | 1 +
 3 files changed, 3 insertions(+)

diff --git a/auto_round/utils.py b/auto_round/utils.py
index 47c4ef2f..fff4a52e 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -660,6 +660,7 @@ def dynamic_import_inference_linear(bits, group_size, backend):
     elif bits == 4 and "exllamav2" in backend:
         logger.warning_once("Please install auto-round from source to enable exllamav2 kernels, switch to triton "
                             "kernels for now")
+        from auto_round_extension.cuda.qliner_triton import QuantLinear
     else:
         from auto_round_extension.cuda.qliner_triton import QuantLinear
     return QuantLinear
diff --git a/examples/language-modeling/requirements.txt b/examples/language-modeling/requirements.txt
index 292166e4..13d67e77 100644
--- a/examples/language-modeling/requirements.txt
+++ b/examples/language-modeling/requirements.txt
@@ -17,4 +17,5 @@ auto-gptq
 openpyxl
 wandb
 py-cpuinfo
+numpy < 2.0
diff --git a/requirements.txt b/requirements.txt
index cb8df6bb..0f5c4f7a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ sentencepiece
 torch
 transformers
 triton
+numpy < 2.0
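
The one-line import that patch 1 adds matters because dynamic_import_inference_linear ends with "return QuantLinear": in the exllamav2-fallback branch the function previously only logged a warning and never bound the name QuantLinear, so the return raised NameError. Below is a minimal, self-contained sketch of that control flow; the function name pick_quant_linear and the string stand-ins are hypothetical, not from the repo (the real code lives in auto_round/utils.py):

    # Simplified sketch of the bug patch 1 fixes: before the fix, the elif
    # branch warned but never assigned QuantLinear, so the final return
    # raised NameError whenever that branch was taken.
    def pick_quant_linear(bits: int, backend: str, exllama2_available: bool) -> str:
        if bits == 4 and exllama2_available and "exllamav2" in backend:
            QuantLinear = "exllamav2 kernel"   # stands in for the CUDA import
        elif bits == 4 and "exllamav2" in backend:
            print("warning: switching to triton kernels for now")
            QuantLinear = "triton kernel"      # the binding patch 1 adds back
        else:
            QuantLinear = "triton kernel"
        return QuantLinear                     # NameError here before the fix

    print(pick_quant_linear(4, "exllamav2", exllama2_available=False))  # -> triton kernel
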
From 81d99f722a4d6f30237db0e1e75901e69ab65141 Mon Sep 17 00:00:00 2001
From: yintong-lu
Date: Tue, 18 Jun 2024 05:02:11 -0400
Subject: [PATCH 2/3] minor fix

Signed-off-by: yintong-lu
---
 auto_round/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/auto_round/utils.py b/auto_round/utils.py
index fff4a52e..0e83f493 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -611,7 +611,6 @@ def get_autogptq_backend_config(backend, bits=4):
 
 def warning_once(logger, msg: str):
     logger.warning(msg)
-logger.warning_once = warning_once
 
 def dynamic_import_inference_linear(bits, group_size, backend):
     """Dynamically imports and returns the appropriate QuantLinear class based on the given bits and backend.
@@ -658,7 +657,7 @@ def dynamic_import_inference_linear(bits, group_size, backend):
     if bits == 4 and exllama2_available and "exllamav2" in backend:
         from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
     elif bits == 4 and "exllamav2" in backend:
-        logger.warning_once("Please install auto-round from source to enable exllamav2 kernels, switch to triton "
+        warning_once(logger, msg="Please install auto-round from source to enable exllamav2 kernels, switch to triton "
                             "kernels for now")
         from auto_round_extension.cuda.qliner_triton import QuantLinear
     else:

From b8326a8c6de7a3ee6317f790bc544a4428dd6a2c Mon Sep 17 00:00:00 2001
From: yintong-lu
Date: Tue, 18 Jun 2024 05:40:01 -0400
Subject: [PATCH 3/3] modify warning_once

Signed-off-by: yintong-lu
---
 auto_round/utils.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/auto_round/utils.py b/auto_round/utils.py
index 0e83f493..2b8d817b 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -25,6 +25,13 @@
 import torch
 from torch.amp import autocast
 
+from functools import lru_cache
+@lru_cache(None)
+def warning_once(self, msg: str):
+    self.warning(msg)
+
+
+logging.Logger.warning_once = warning_once
 logger = logging.getLogger("autoround")
 logger.setLevel(logging.INFO)
 logger.propagate = False
@@ -35,7 +42,6 @@
 import importlib
 import transformers
-from functools import lru_cache
 
 
 class LazyImport(object):
     """Lazy import python module till use."""
@@ -607,10 +613,6 @@ def get_autogptq_backend_config(backend, bits=4):
         use_triton = False
     return use_triton, disable_exllamav1, disable_exllamav2, use_qigen, disable_marlin
 
-@lru_cache(None)
-def warning_once(logger, msg: str):
-    logger.warning(msg)
-
 
 def dynamic_import_inference_linear(bits, group_size, backend):
     """Dynamically imports and returns the appropriate QuantLinear class based on the given bits and backend.
@@ -657,7 +659,7 @@ def dynamic_import_inference_linear(bits, group_size, backend):
     if bits == 4 and exllama2_available and "exllamav2" in backend:
         from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
     elif bits == 4 and "exllamav2" in backend:
-        warning_once(logger, msg="Please install auto-round from source to enable exllamav2 kernels, switch to triton "
+        logger.warning_once("Please install auto-round from source to enable exllamav2 kernels, switch to triton "
                             "kernels for now")
         from auto_round_extension.cuda.qliner_triton import QuantLinear
     else:
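
Two details make patch 3 work where the earlier attempts did not. Assigning the function to the class logging.Logger turns it into an ordinary method: attribute lookup goes through the descriptor protocol, so self is bound automatically. The per-instance assignment removed in patch 2 (logger.warning_once = warning_once) never bound the logger, so logger.warning_once(msg) came up one argument short. And lru_cache(None) keys its cache on the (self, msg) pair, so each distinct message is emitted once per logger. The sketch below exercises the same pattern standalone; the decorated function and class assignment are taken from the diff, while the basicConfig call and the "demo" logger are added here only for illustration:

    import logging
    from functools import lru_cache


    @lru_cache(None)  # unbounded cache keyed on the (self, msg) pair
    def warning_once(self, msg: str):
        self.warning(msg)


    # Class-level assignment makes this a bound method on every Logger.
    logging.Logger.warning_once = warning_once

    logging.basicConfig(level=logging.WARNING)
    log = logging.getLogger("demo")
    log.warning_once("shown once")
    log.warning_once("shown once")  # cache hit: nothing is logged

One trade-off worth noting: the unbounded cache holds strong references to every (logger, message) pair for the life of the process, which is fine for a fixed set of warning strings but would grow without bound if messages were generated dynamically.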