From b0f96a0e5fa65939ab42209f8b933af7d4a8478c Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Mon, 30 Dec 2024 21:46:47 -0500
Subject: [PATCH] add cuda ut

Signed-off-by: n1ck-guo
---
 test_cuda/test_gguf.py | 59 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 test_cuda/test_gguf.py

diff --git a/test_cuda/test_gguf.py b/test_cuda/test_gguf.py
new file mode 100644
index 00000000..050efd25
--- /dev/null
+++ b/test_cuda/test_gguf.py
@@ -0,0 +1,59 @@
+import copy
+import shutil
+import sys
+import unittest
+
+sys.path.insert(0, "..")
+import torch
+import transformers
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from auto_round import AutoRound
+
+
+class LLMDataLoader:
+    def __init__(self):
+        self.batch_size = 1
+
+    def __iter__(self):
+        for i in range(2):
+            yield torch.ones([1, 10], dtype=torch.long)
+
+
+class TestAutoRound(unittest.TestCase):
+    @classmethod
+    def setUpClass(self):
+        model_name = "Qwen/Qwen2.5-0.5B-Instruct"
+        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        self.llm_dataloader = LLMDataLoader()
+
+    # @classmethod
+    # def tearDownClass(self):
+    #     shutil.rmtree("./saved", ignore_errors=True)
+    #     shutil.rmtree("runs", ignore_errors=True)
+
+    def test_gguf_format(self):
+        bits, group_size, sym = 4, 32, False
+        autoround = AutoRound(
+            self.model,
+            self.tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=2,
+            seqlen=2,
+            nsamples=2,
+            dataset=self.llm_dataloader,
+        )
+        autoround.quantize()
+        quantized_model_path = "./saved"
+        autoround.save_quantized(output_dir=quantized_model_path, format="gguf:q4_1")
+
+        from llama_cpp import Llama
+        llm = Llama("saved/Qwen2.5-0.5B-Instruct-Q4_1.gguf", n_gpu_layers=-1)
+        output = llm("There is a girl who likes adventure,", max_tokens=32)
+        print(output)
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
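
For context on the settings exercised above: exporting with format="gguf:q4_1" pairs naturally with bits=4, group_size=32, sym=False, since GGUF's Q4_1 type stores asymmetric 4-bit weights in 32-element blocks, each with its own scale and minimum. Below is a minimal numpy sketch of that block scheme; it is illustrative only, not auto_round's or llama.cpp's actual code, and the helper names are hypothetical.

# Illustrative numpy sketch of GGUF Q4_1 block quantization (asymmetric
# 4-bit, blocks of 32); not the actual auto_round/llama.cpp implementation.
import numpy as np

QK4_1 = 32  # Q4_1 block size, matching group_size=32 in the test above

def quantize_q4_1_block(x: np.ndarray):
    """Quantize one block of 32 floats to (scale, min, 4-bit codes)."""
    assert x.shape == (QK4_1,)
    lo, hi = float(x.min()), float(x.max())
    d = (hi - lo) / 15 or 1.0  # per-block scale; guard against a constant block
    q = np.clip(np.round((x - lo) / d), 0, 15).astype(np.uint8)
    return np.float16(d), np.float16(lo), q

def dequantize_q4_1_block(d, m, q):
    """Reconstruct the block as q * scale + min."""
    return q.astype(np.float32) * np.float32(d) + np.float32(m)

x = np.random.randn(QK4_1).astype(np.float32)
d, m, q = quantize_q4_1_block(x)
err = np.abs(dequantize_q4_1_block(d, m, q) - x).max()
print(f"max abs reconstruction error: {err:.4f}")  # roughly bounded by d / 2

In the test itself this arithmetic is handled entirely by autoround.save_quantized(..., format="gguf:q4_1") and by llama_cpp when the file is reloaded; the sketch only shows why an asymmetric (sym=False) 4-bit, group-size-32 configuration maps onto the Q4_1 type.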