add cuda ut

Signed-off-by: n1ck-guo <[email protected]>
intel · Dec 31, 2024 · b0f96a0 · b0f96a0
1 parent e525f97
commit b0f96a0
Showing 1 changed file with 59 additions and 0 deletions.
diff --git a/test_cuda/test_gguf.py b/test_cuda/test_gguf.py
@@ -0,0 +1,59 @@
+import copy
+import shutil
+import sys
+import unittest
+
+sys.path.insert(0, "..")
+import torch
+import transformers
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from auto_round import AutoRound
+
+class LLMDataLoader:
+    def __init__(self):
+        self.batch_size = 1
+
+    def __iter__(self):
+        for i in range(2):
+            yield torch.ones([1, 10], dtype=torch.long)
+
+
+class TestAutoRound(unittest.TestCase):
+    @classmethod
+    def setUpClass(self):
+        model_name = "Qwen/Qwen2.5-0.5B-Instruct"
+        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        self.llm_dataloader = LLMDataLoader()
+
+    # @classmethod
+    # def tearDownClass(self):
+    #     shutil.rmtree("./saved", ignore_errors=True)
+    #     shutil.rmtree("runs", ignore_errors=True)
+
+    def test_gguf_format(self):
+        bits, group_size, sym = 4, 32, False
+        autoround = AutoRound(
+            self.model,
+            self.tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=2,
+            seqlen=2,
+            nsamples=2,
+            dataset=self.llm_dataloader,
+        )
+        autoround.quantize()
+        quantized_model_path = "./saved"    
+        autoround.save_quantized(output_dir=quantized_model_path, format="gguf:q4_1")
+
+        from llama_cpp import Llama
+        llm = Llama("saved/Qwen2.5-0.5B-Instruct-Q4_1.gguf", n_gpu_layers=-1)
+        output = llm("There is a girl who likes adventure,", max_tokens=32)
+        print(output)
+
+
+if __name__ == "__main__":
+    unittest.main()