diff --git a/.github/workflows/pytest-check.yml b/.github/workflows/pytest-check.yml
index 14e666c8..ca0d9152 100644
--- a/.github/workflows/pytest-check.yml
+++ b/.github/workflows/pytest-check.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8.18", "3.9.19", "3.10.14"]
+        python-version: ["3.8.18"]
 
     steps:
     - uses: szenius/set-timezone@v1.2
diff --git a/examples/custom_pytorch_model.py b/examples/custom_pytorch_model.py
index 4dc26987..52377106 100644
--- a/examples/custom_pytorch_model.py
+++ b/examples/custom_pytorch_model.py
@@ -25,7 +25,7 @@ def load_hf_model(model_args: ModelArguments):
 
 evaluator = Evaluator(
     model_args=ModelArguments(
-        model_name_or_path="../Phi-3-mini-128k-instruct",
+        model_name_or_path="../your-model-path",
         model_type="chat",
         model_backend="huggingface",
         prefix_caching=False,
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/dry_test/__init__.py b/tests/dry_test/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/dry_test/test_datasets.py b/tests/dry_test/test_datasets.py
new file mode 100644
index 00000000..a809e29c
--- /dev/null
+++ b/tests/dry_test/test_datasets.py
@@ -0,0 +1,78 @@
+import pytest
+
+from ..fixtures import run_evaluate
+
+# Extra CLI arguments per dataset; None marks datasets that cannot be
+# dry-run with a chat API model, so the test skips them below.
+datasets = {
+    "agieval": [],
+    "alpaca_eval": None,
+    "anli": [],
+    "arc": [],
+    "bbh": [],
+    "boolq": [],
+    "cb": [],
+    "ceval": [],
+    "cmmlu": [],
+    "cnn_dailymail": [],
+    "color_objects": [],
+    "commonsenseqa": [],
+    "copa": [],
+    "coqa": None,
+    "crows_pairs": None,
+    "drop": [],
+    "gaokao": [],
+    "gsm8k": [],
+    "gpqa": [],
+    "halueval": [],
+    "hellaswag": [],
+    "humaneval": ["--pass_at_k", "1"],
+    "ifeval": [],
+    "lambada": [],
+    "math": [],
+    "mbpp": ["--pass_at_k", "1"],
+    "mmlu": [],
+    "mrpc": [],
+    "mt_bench": None,
+    "nq": [],
+    "openbookqa": [],
+    "penguins_in_a_table": [],
+    "piqa": [],
+    "qnli": [],
+    "quac": [],
+    "race": [],
+    "real_toxicity_prompts": None,
+    "rte": [],
+    "siqa": [],
+    "squad": [],
+    "squad_v2": [],
+    "story_cloze": None,
+    "tldr": [],
+    "triviaqa": [],
+    "truthfulqa_mc": [],
+    "tydiqa": [],
+    "vicuna_bench": None,
+    "webq": [],
+    "wic": [],
+    "winogender": [],
+    "winograd": [],
+    "winogrande": [],
+    "wmt16:de-en": [],
+    "wsc": [],
+    "xcopa": [],
+    "xlsum": [],
+    "xsum": [],
+}
+
+
+@pytest.mark.parametrize("dataset, extra_args", datasets.items())
+def test_datasets_dry_run(run_evaluate, dataset, extra_args):
+    if extra_args is None:
+        pytest.skip(f"{dataset} does not support dry-run evaluation")
+    run_evaluate(["-m", "gpt-3.5-turbo", "-d", dataset, "-b", "10", "--dry_run", "--cuda", "0", "--openai_api_key", "fake_key"] + extra_args)
+
+
+# CrowS-Pairs only supports get_ppl (see utilization/dataset/crows_pairs.py
+# below), so it runs against a local gpt2 instead of a chat API model.
+def test_crows_pairs_dry_run(run_evaluate):
+    run_evaluate(["-m", "gpt2", "-d", "crows_pairs", "-b", "10", "--dry_run", "--cuda", "0"])
diff --git a/tests/dry_test/test_models.py b/tests/dry_test/test_models.py
new file mode 100644
index 00000000..f079e913
--- /dev/null
+++ b/tests/dry_test/test_models.py
@@ -0,0 +1,26 @@
+import pytest
+
+from ..fixtures import run_evaluate
+
+# Each entry pairs a model with the extra CLI flags its backend expects; the
+# API keys are fakes, since --dry_run never issues real requests. A list of
+# tuples (not a dict) lets the two gpt2 configurations coexist.
+models = [
+    ("gpt-3.5-turbo", ["--openai_api_key", "fake_key"]),
+    ("claude-3-haiku-20240307", ["--anthropic_api_key", "fake_key"]),
+    ("qwen-turbo", ["--dashscope_api_key", "fake_key"]),
+    ("ERNIE-Speed", ["--qianfan_access_key", "fake_key", "--qianfan_secret_key", "fake_key"]),
+    ("gpt2", ["--vllm", "False", "--prefix_caching", "True", "--cuda", "0"]),
+    ("gpt2", ["--vllm", "True", "--prefix_caching", "False", "--cuda", "0"]),
+]
+
+
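+# The stacked parametrize decorators form a cross-product: every
+# (model, extra_args) pair runs against each of the three datasets.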
+@pytest.mark.parametrize("model, extra_args", models.items()) +def test_models_dry_run(run_evaluate, model, dataset, extra_args): + if extra_args is None: + return + run_evaluate(["-m", model, "-d", dataset, "-b", "10", "--dry_run"] + extra_args) + diff --git a/tests/fixtures.py b/tests/fixtures.py new file mode 100644 index 00000000..25e3df47 --- /dev/null +++ b/tests/fixtures.py @@ -0,0 +1,23 @@ +from typing import List + +import pytest + +from utilization import Evaluator, parse_argument + + +@pytest.fixture +def run_evaluate(): + def evaluate(args: List[str]): + model_args, dataset_args, evaluation_args = parse_argument( + args=args, + initalize=True, + ) + + evaluator = Evaluator( + model_args=model_args, + dataset_args=dataset_args, + evaluation_args=evaluation_args, + initalize=False, + ) + return evaluator.evaluate() + return evaluate diff --git a/tests/requirements-tests.txt b/tests/requirements-tests.txt index 6ddd71c7..787293d6 100644 --- a/tests/requirements-tests.txt +++ b/tests/requirements-tests.txt @@ -1,3 +1,36 @@ +# pytest pytest pytest-md -pytest-emoji \ No newline at end of file +pytest-emoji + +torch +transformers +safetensors +tokenizers +datasets>=2.16.1 +coloredlogs +tqdm>=4.58.0 +jinja2 + +# Efficient inference +packaging +vllm +flash-attn # https://github.com/Dao-AILab/flash-attention/issues/453#issuecomment-1692867770 + +# API Models +anthropic +dashscope +qianfan +openai>=1.0.0 +tiktoken>=0.5.0 + +# Metrics +nltk +sacrebleu +rouge_score +langcodes +language_data +google-api-python-client +immutabledict +langdetect + diff --git a/tests/test_empty.py b/tests/test_empty.py deleted file mode 100644 index 80d2166c..00000000 --- a/tests/test_empty.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_empty(): - assert True diff --git a/utilization/dataset/crows_pairs.py b/utilization/dataset/crows_pairs.py index 2cc35d97..48701bbc 100644 --- a/utilization/dataset/crows_pairs.py +++ b/utilization/dataset/crows_pairs.py @@ -20,6 +20,10 @@ class Crows_pairs(MultipleChoiceDataset): example_set = None load_args = ("crows_pairs",) + def init_arguments(self): + if self.model_evaluation_method != "get_ppl": + raise ValueError("CrowS-Pairs dataset only supports PPL evaluation method.") + def format_instance(self, instance): # source text is empty options = [" " + instance["sent_more"], " " + instance["sent_less"]] diff --git a/utilization/dataset/gaokao.py b/utilization/dataset/gaokao.py index 315fd09b..8c7759dd 100644 --- a/utilization/dataset/gaokao.py +++ b/utilization/dataset/gaokao.py @@ -3,7 +3,7 @@ from logging import getLogger from ..metric import Gaokao_bench_metric -from .enum import GAOKAO_TASKS +from .dataset_enum import GAOKAO_TASKS from .generation_dataset import GenerationDataset logger = getLogger(__name__) diff --git a/utilization/dataset/load.py b/utilization/dataset/load.py index 769c773a..689581ae 100644 --- a/utilization/dataset/load.py +++ b/utilization/dataset/load.py @@ -35,10 +35,17 @@ def _import_dataset_class(dataset_name: str) -> Type[Dataset]: module = importlib.import_module(module_path) except ModuleNotFoundError as e: all_datasets = list_datasets() - fuzzy_match = difflib.get_close_matches(dataset_name, list(all_datasets), cutoff=0.6) - if len(fuzzy_match) == 0: - fuzzy_match = all_datasets - raise ValueError(f"Invalid dataset: {dataset_name}. 
+        if f"utilization.dataset.{dataset_name}" in str(e):
+            matches = difflib.get_close_matches(dataset_name, list(all_datasets), cutoff=0.6)
+            if len(matches) == 0:
+                fuzzy_match = f" Available choices are: {all_datasets}."
+            else:
+                fuzzy_match = f" Possible choices are: {matches}."
+        else:
+            fuzzy_match = ""
+
+        raise ValueError(f"Invalid dataset: {dataset_name}.{fuzzy_match}\n{e}") from e
 
     clsmembers = inspect.getmembers(module, inspect.isclass)
     for name, obj in clsmembers:
diff --git a/utilization/dataset/utils.py b/utilization/dataset/utils.py
index 5c423a7e..d4bfa29e 100644
--- a/utilization/dataset/utils.py
+++ b/utilization/dataset/utils.py
@@ -39,7 +39,7 @@ def set_tokenizer(
         self.tokenizer_decode = tokenizer.decode
 
     def _apply_normalization(self, conversations: List[Conversation]):
-        normalized_conversations = [Conversation.from_chat(assistant=conv[-1]) for conv in conversations]
+        normalized_conversations = [Conversation.from_chat(assistant=conv[-1]["content"]) for conv in conversations]
         conversations.extend(normalized_conversations)
 
     def prompt_token_nums(self, prompt: str):
diff --git a/utilization/utils/conversation.py b/utilization/utils/conversation.py
index 0c2aebc9..8d262bb4 100644
--- a/utilization/utils/conversation.py
+++ b/utilization/utils/conversation.py
@@ -344,8 +344,10 @@ def add(
         if other is None:
             messages = []
             if user:
+                assert isinstance(user, str)
                 messages.append({"role": "user", "content": user})
             if assistant:
+                assert isinstance(assistant, str)
                 messages.append({"role": "assistant", "content": assistant})
         else:
             messages = other.messages
@@ -364,11 +366,19 @@ def add_(
         if other is None:
             messages = []
             if user:
+                assert isinstance(user, str)
                 messages.append({"role": "user", "content": user})
             if assistant:
+                assert isinstance(assistant, str)
                 messages.append({"role": "assistant", "content": assistant})
         else:
             messages = other.messages
         # add a copy of other messages
         self.messages.extend(messages)
         return self
+
+    def __repr__(self):
+        output = f"Conversation id: {self.uuid}\n"
+        for message in self.messages:
+            output += f" > {message['role']}: {message['content']}\n"
+        return output