diff --git a/.github/workflows/pytest-check.yml b/.github/workflows/pytest-check.yml
index 14e666c8..ca0d9152 100644
--- a/.github/workflows/pytest-check.yml
+++ b/.github/workflows/pytest-check.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8.18", "3.9.19", "3.10.14"]
+        python-version: ["3.8.18"]
 
     steps:
     - uses: szenius/set-timezone@v1.2
diff --git a/examples/custom_pytorch_model.py b/examples/custom_pytorch_model.py
index 4dc26987..52377106 100644
--- a/examples/custom_pytorch_model.py
+++ b/examples/custom_pytorch_model.py
@@ -25,7 +25,7 @@ def load_hf_model(model_args: ModelArguments):
 
 evaluator = Evaluator(
     model_args=ModelArguments(
-        model_name_or_path="../Phi-3-mini-128k-instruct",
+        model_name_or_path="../your-model-path",
         model_type="chat",
         model_backend="huggingface",
         prefix_caching=False,
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/dry_test/__init__.py b/tests/dry_test/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/dry_test/test_datasets.py b/tests/dry_test/test_datasets.py
new file mode 100644
index 00000000..a809e29c
--- /dev/null
+++ b/tests/dry_test/test_datasets.py
@@ -0,0 +1,78 @@
+import pytest
+
+from ..fixtures import run_evaluate
+
+# Extra CLI arguments per dataset; None marks datasets that cannot be
+# dry-run with a chat API model, so the test skips them below.
+datasets = {
+    "agieval": [],
+    "alpaca_eval": None,
+    "anli": [],
+    "arc": [],
+    "bbh": [],
+    "boolq": [],
+    "cb": [],
+    "ceval": [],
+    "cmmlu": [],
+    "cnn_dailymail": [],
+    "color_objects": [],
+    "commonsenseqa": [],
+    "copa": [],
+    "coqa": None,
+    "crows_pairs": None,
+    "drop": [],
+    "gaokao": [],
+    "gsm8k": [],
+    "gpqa": [],
+    "halueval": [],
+    "hellaswag": [],
+    "humaneval": ["--pass_at_k", "1"],
+    "ifeval": [],
+    "lambada": [],
+    "math": [],
+    "mbpp": ["--pass_at_k", "1"],
+    "mmlu": [],
+    "mrpc": [],
+    "mt_bench": None,
+    "nq": [],
+    "openbookqa": [],
+    "penguins_in_a_table": [],
+    "piqa": [],
+    "qnli": [],
+    "quac": [],
+    "race": [],
+    "real_toxicity_prompts": None,
+    "rte": [],
+    "siqa": [],
+    "squad": [],
+    "squad_v2": [],
+    "story_cloze": None,
+    "tldr": [],
+    "triviaqa": [],
+    "truthfulqa_mc": [],
+    "tydiqa": [],
+    "vicuna_bench": None,
+    "webq": [],
+    "wic": [],
+    "winogender": [],
+    "winograd": [],
+    "winogrande": [],
+    "wmt16:de-en": [],
+    "wsc": [],
+    "xcopa": [],
+    "xlsum": [],
+    "xsum": [],
+}
+
+
+@pytest.mark.parametrize("dataset, extra_args", datasets.items())
+def test_datasets_dry_run(run_evaluate, dataset, extra_args):
+    if extra_args is None:
+        pytest.skip(f"{dataset} does not support dry-run evaluation")
+    run_evaluate(["-m", "gpt-3.5-turbo", "-d", dataset, "-b", "10", "--dry_run", "--cuda", "0", "--openai_api_key", "fake_key"] + extra_args)
+
+
+# CrowS-Pairs only supports get_ppl (see utilization/dataset/crows_pairs.py
+# below), so it runs against a local gpt2 instead of a chat API model.
+def test_crows_pairs_dry_run(run_evaluate):
+    run_evaluate(["-m", "gpt2", "-d", "crows_pairs", "-b", "10", "--dry_run", "--cuda", "0"])
diff --git a/tests/dry_test/test_models.py b/tests/dry_test/test_models.py
new file mode 100644
index 00000000..f079e913
--- /dev/null
+++ b/tests/dry_test/test_models.py
@@ -0,0 +1,26 @@
+import pytest
+
+from ..fixtures import run_evaluate
+
+# Each entry pairs a model with the extra CLI flags its backend expects; the
+# API keys are fakes, since --dry_run never issues real requests. A list of
+# tuples (not a dict) lets the two gpt2 configurations coexist.
+models = [
+    ("gpt-3.5-turbo", ["--openai_api_key", "fake_key"]),
+    ("claude-3-haiku-20240307", ["--anthropic_api_key", "fake_key"]),
+    ("qwen-turbo", ["--dashscope_api_key", "fake_key"]),
+    ("ERNIE-Speed", ["--qianfan_access_key", "fake_key", "--qianfan_secret_key", "fake_key"]),
+    ("gpt2", ["--vllm", "False", "--prefix_caching", "True", "--cuda", "0"]),
+    ("gpt2", ["--vllm", "True", "--prefix_caching", "False", "--cuda", "0"]),
+]
+
+
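+# The stacked parametrize decorators form a cross-product: every
+# (model, extra_args) pair runs against each of the three datasets.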
+@pytest.mark.parametrize("model, extra_args", models.items()) +def test_models_dry_run(run_evaluate, model, dataset, extra_args): + if extra_args is None: + return + run_evaluate(["-m", model, "-d", dataset, "-b", "10", "--dry_run"] + extra_args) + diff --git a/tests/fixtures.py b/tests/fixtures.py new file mode 100644 index 00000000..25e3df47 --- /dev/null +++ b/tests/fixtures.py @@ -0,0 +1,23 @@ +from typing import List + +import pytest + +from utilization import Evaluator, parse_argument + + +@pytest.fixture +def run_evaluate(): + def evaluate(args: List[str]): + model_args, dataset_args, evaluation_args = parse_argument( + args=args, + initalize=True, + ) + + evaluator = Evaluator( + model_args=model_args, + dataset_args=dataset_args, + evaluation_args=evaluation_args, + initalize=False, + ) + return evaluator.evaluate() + return evaluate diff --git a/tests/requirements-tests.txt b/tests/requirements-tests.txt index 6ddd71c7..787293d6 100644 --- a/tests/requirements-tests.txt +++ b/tests/requirements-tests.txt @@ -1,3 +1,36 @@ +# pytest pytest pytest-md -pytest-emoji \ No newline at end of file +pytest-emoji + +torch +transformers +safetensors +tokenizers +datasets>=2.16.1 +coloredlogs +tqdm>=4.58.0 +jinja2 + +# Efficient inference +packaging +vllm +flash-attn # https://github.com/Dao-AILab/flash-attention/issues/453#issuecomment-1692867770 + +# API Models +anthropic +dashscope +qianfan +openai>=1.0.0 +tiktoken>=0.5.0 + +# Metrics +nltk +sacrebleu +rouge_score +langcodes +language_data +google-api-python-client +immutabledict +langdetect + diff --git a/tests/test_empty.py b/tests/test_empty.py deleted file mode 100644 index 80d2166c..00000000 --- a/tests/test_empty.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_empty(): - assert True diff --git a/utilization/dataset/crows_pairs.py b/utilization/dataset/crows_pairs.py index 2cc35d97..48701bbc 100644 --- a/utilization/dataset/crows_pairs.py +++ b/utilization/dataset/crows_pairs.py @@ -20,6 +20,10 @@ class Crows_pairs(MultipleChoiceDataset): example_set = None load_args = ("crows_pairs",) + def init_arguments(self): + if self.model_evaluation_method != "get_ppl": + raise ValueError("CrowS-Pairs dataset only supports PPL evaluation method.") + def format_instance(self, instance): # source text is empty options = [" " + instance["sent_more"], " " + instance["sent_less"]] diff --git a/utilization/dataset/gaokao.py b/utilization/dataset/gaokao.py index 315fd09b..8c7759dd 100644 --- a/utilization/dataset/gaokao.py +++ b/utilization/dataset/gaokao.py @@ -3,7 +3,7 @@ from logging import getLogger from ..metric import Gaokao_bench_metric -from .enum import GAOKAO_TASKS +from .dataset_enum import GAOKAO_TASKS from .generation_dataset import GenerationDataset logger = getLogger(__name__) diff --git a/utilization/dataset/load.py b/utilization/dataset/load.py index 769c773a..689581ae 100644 --- a/utilization/dataset/load.py +++ b/utilization/dataset/load.py @@ -35,10 +35,17 @@ def _import_dataset_class(dataset_name: str) -> Type[Dataset]: module = importlib.import_module(module_path) except ModuleNotFoundError as e: all_datasets = list_datasets() - fuzzy_match = difflib.get_close_matches(dataset_name, list(all_datasets), cutoff=0.6) - if len(fuzzy_match) == 0: - fuzzy_match = all_datasets - raise ValueError(f"Invalid dataset: {dataset_name}. 
+        if f"utilization.dataset.{dataset_name}" in str(e):
+            matches = difflib.get_close_matches(dataset_name, list(all_datasets), cutoff=0.6)
+            if len(matches) == 0:
+                fuzzy_match = f" Available choices are: {all_datasets}."
+            else:
+                fuzzy_match = f" Possible choices are: {matches}."
+        else:
+            fuzzy_match = ""
+
+        raise ValueError(f"Invalid dataset: {dataset_name}.{fuzzy_match}\n{e}") from e
 
     clsmembers = inspect.getmembers(module, inspect.isclass)
     for name, obj in clsmembers:
diff --git a/utilization/dataset/utils.py b/utilization/dataset/utils.py
index 5c423a7e..d4bfa29e 100644
--- a/utilization/dataset/utils.py
+++ b/utilization/dataset/utils.py
@@ -39,7 +39,7 @@ def set_tokenizer(
         self.tokenizer_decode = tokenizer.decode
 
     def _apply_normalization(self, conversations: List[Conversation]):
-        normalized_conversations = [Conversation.from_chat(assistant=conv[-1]) for conv in conversations]
+        normalized_conversations = [Conversation.from_chat(assistant=conv[-1]["content"]) for conv in conversations]
         conversations.extend(normalized_conversations)
 
     def prompt_token_nums(self, prompt: str):
diff --git a/utilization/utils/conversation.py b/utilization/utils/conversation.py
index 0c2aebc9..8d262bb4 100644
--- a/utilization/utils/conversation.py
+++ b/utilization/utils/conversation.py
@@ -344,8 +344,10 @@ def add(
         if other is None:
             messages = []
             if user:
+                assert isinstance(user, str)
                 messages.append({"role": "user", "content": user})
             if assistant:
+                assert isinstance(assistant, str)
                 messages.append({"role": "assistant", "content": assistant})
         else:
             messages = other.messages
@@ -364,11 +366,19 @@ def add_(
         if other is None:
             messages = []
             if user:
+                assert isinstance(user, str)
                 messages.append({"role": "user", "content": user})
             if assistant:
+                assert isinstance(assistant, str)
                 messages.append({"role": "assistant", "content": assistant})
         else:
             messages = other.messages
         # add a copy of other messages
         self.messages.extend(messages)
         return self
+
+    def __repr__(self):
+        output = f"Conversation id: {self.uuid}\n"
+        for message in self.messages:
+            output += f" > {message['role']}: {message['content']}\n"
+        return output