diff --git a/test_cuda/test_support_vlms.py b/test_cuda/test_support_vlms.py
index 91e69aa3..357fee95 100644
--- a/test_cuda/test_support_vlms.py
+++ b/test_cuda/test_support_vlms.py
@@ -81,7 +81,7 @@ def test_phi3(self):
         ## test tune
         res = os.system(
             f"cd .. && {self.python_path} -m auto_round --mllm "
-            f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}")
+            f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}")
         self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail")
 
         ## test infer
@@ -114,11 +114,74 @@ def test_phi3(self):
 
         image_inputs = Image.open(requests.get(image_url, stream=True).raw)
         inputs = processor(prompt, image_inputs, return_tensors="pt").to(model.device)
-        generation_args = {
+        generation_args = {
+            "max_new_tokens": 1000,
+            "temperature": 0.0,
+            "do_sample": False,
+        }
+
+        generate_ids = model.generate(**inputs,
+                                      eos_token_id=processor.tokenizer.eos_token_id,
+                                      **generation_args
+                                      )
+
+        # remove input tokens
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = processor.batch_decode(generate_ids,
+                                          skip_special_tokens=True,
+                                          clean_up_tokenization_spaces=False)[0]
+        print(response)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
+
+    def test_phi3_vision_awq(self):
+        model_path = "/models/Phi-3.5-vision-instruct/"
+        ## test tune
+        res = os.system(
+            f"cd .. && {self.python_path} -m auto_round --mllm "
+            f"--model {model_path} --iter 2 --quant_nontext_module "
+            f"--nsample 64 --seqlen 32 "
+            f"--format auto_awq --output_dir {self.save_dir} --device {self.device}")
+        self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail")
+
+        ## test infer
+        from transformers import AutoModelForCausalLM, AutoProcessor
+        from auto_round.export.export_to_awq import WQLinear_GEMM
+        quantized_model_path = os.path.join(self.save_dir, "Phi-3.5-vision-instruct-w4g128-auto_awq")
+        res = os.system(f"cp /models/Phi-3.5-vision-instruct/*.py {quantized_model_path}")
+        model = AutoModelForCausalLM.from_pretrained(
+            quantized_model_path,
+            device_map=f"cuda:{self.device}",
+            trust_remote_code=True,
+            torch_dtype="auto"
+        )
+        assert "WQLinear_GEMM" in str(
+            type(model.model.vision_embed_tokens.img_processor.vision_model.encoder.layers[0].mlp.fc1)), \
+            "model quantization failed."
+        processor = AutoProcessor.from_pretrained(quantized_model_path,
+                                                  trust_remote_code=True,
+                                                  num_crops=4
+                                                  )
+
+        image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+        content = "Describe this image."
+        messages = [
+            {"role": "user",
+             "content": "<|image_1|>\n"+content},
+        ]
+
+        prompt = processor.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        image_inputs = Image.open(requests.get(image_url, stream=True).raw)
+        inputs = processor(prompt, image_inputs, return_tensors="pt").to(model.device)
+
+        generation_args = {
             "max_new_tokens": 1000,
             "temperature": 0.0,
             "do_sample": False,
-        }
+        }
 
         generate_ids = model.generate(**inputs,
                                       eos_token_id=processor.tokenizer.eos_token_id,
@@ -272,4 +335,4 @@ def test_72b(self):
         shutil.rmtree(self.save_dir, ignore_errors=True)
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()