[CI] dry_test #242

Closed · wants to merge 6 commits
2 changes: 1 addition & 1 deletion .github/workflows/pytest-check.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8.18", "3.9.19", "3.10.14"]
+        python-version: ["3.8.18"]

     steps:
       - uses: szenius/[email protected]
2 changes: 1 addition & 1 deletion examples/custom_pytorch_model.py
@@ -25,7 +25,7 @@ def load_hf_model(model_args: ModelArguments):

 evaluator = Evaluator(
     model_args=ModelArguments(
-        model_name_or_path="../Phi-3-mini-128k-instruct",
+        model_name_or_path="../your-model-path",
         model_type="chat",
         model_backend="huggingface",
         prefix_caching=False,
Empty file added tests/__init__.py
Empty file added tests/dry_test/__init__.py
74 changes: 74 additions & 0 deletions tests/dry_test/test_datasets.py
@@ -0,0 +1,74 @@
import pytest

from ..fixtures import run_evaluate

datasets = {
"agieval": [],
"alpaca_eval": None,
"anli": [],
"arc": [],
"bbh": [],
"boolq": [],
"cb": [],
"ceval": [],
"cmmlu": [],
"cnn_dailymail": [],
"color_objects": [],
"commonsenseqa": [],
"copa": [],
"coqa": None,
"crows_pairs": None,
"drop": [],
"gaokao": [],
"gsm8k": [],
"gpqa": [],
"halueval": [],
"hellaswag": [],
"humaneval": ["--pass_at_k", "1"],
"ifeval": [],
"lambada": [],
"math": [],
"mbpp": ["--pass_at_k", "1"],
"mmlu": [],
"mrpc": [],
"mt_bench": None,
"nq": [],
"openbookqa": [],
"penguins_in_a_table": [],
"piqa": [],
"qnli": [],
"quac": [],
"race": [],
"real_toxicity_prompts": None,
"rte": [],
"siqa": [],
"squad": [],
"squad_v2": [],
"story_cloze": None,
"tldr": [],
"triviaqa": [],
"truthfulqa_mc": [],
"tydiqa": [],
"vicuna_bench": None,
"webq": [],
"wic": [],
"winogender": [],
"winograd": [],
"winogrande": [],
"wmt16:de-en": [],
"wsc": [],
"xcopa": [],
"xlsum": [],
"xsum": [],
}


@pytest.mark.parametrize("dataset, extra_args", datasets.items())
def test_datasets_dry_run(run_evaluate, dataset, extra_args):
    if extra_args is None:
        # datasets mapped to None above need special handling and are skipped here
        return
    run_evaluate(["-m", "gpt-3.5-turbo", "-d", dataset, "-b", "10", "--dry_run", "--cuda", "0", "--openai_api_key", "fake_key"] + extra_args)


def test_crows_pairs_dry_run(run_evaluate):
    run_evaluate(["-m", "gpt2", "-d", "crows_pairs", "-b", "10", "--dry_run", "--cuda", "0"])
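
Note: pytest.mark.parametrize accepts any iterable of argument tuples, so datasets.items() above generates one test case per dataset. A minimal, self-contained sketch of that expansion (the cases dict here is illustrative, not part of this PR):

import pytest

cases = {"gsm8k": [], "mt_bench": None}

@pytest.mark.parametrize("name, extra", cases.items())
def test_expansion(name, extra):
    # each (key, value) pair becomes one generated test; entries mapped
    # to None are passed over, mirroring the early return above
    if extra is None:
        pytest.skip("dataset needs special handling")
    assert isinstance(extra, list)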
21 changes: 21 additions & 0 deletions tests/dry_test/test_models.py
@@ -0,0 +1,21 @@
import pytest

from ..fixtures import run_evaluate

# note: a dict literal would silently keep only the last of the two "gpt2"
# entries (duplicate keys collapse), so a list of (model, args) pairs is used
models = [
    ("gpt-3.5-turbo", ["--openai_api_key", "fake_key"]),
    ("claude-3-haiku-20240307", ["--anthropic_api_key", "fake_key"]),
    ("qwen-turbo", ["--dashscope_api_key", "fake_key"]),
    ("ERNIE-Speed", ["--qianfan_access_key", "fake_key", "--qianfan_secret_key", "fake_key"]),
    ("gpt2", ["--vllm", "False", "--prefix_caching", "True", "--cuda", "0"]),
    ("gpt2", ["--vllm", "True", "--prefix_caching", "False", "--cuda", "0"]),
]


@pytest.mark.parametrize("dataset", ["gsm8k", "hellaswag", "mmlu"])
@pytest.mark.parametrize("model, extra_args", models)
def test_models_dry_run(run_evaluate, model, dataset, extra_args):
    if extra_args is None:
        return
    run_evaluate(["-m", model, "-d", dataset, "-b", "10", "--dry_run"] + extra_args)
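
A quick demonstration of why the list-of-pairs fix above matters; in a dict literal, repeated keys are silently collapsed to the last value, so one of the two gpt2 configurations would never run:

d = {"gpt2": ["--vllm", "False"], "gpt2": ["--vllm", "True"]}
assert len(d) == 1
assert d["gpt2"] == ["--vllm", "True"]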

23 changes: 23 additions & 0 deletions tests/fixtures.py
@@ -0,0 +1,23 @@
from typing import List

import pytest

from utilization import Evaluator, parse_argument


@pytest.fixture
def run_evaluate():

    def evaluate(args: List[str]):
        model_args, dataset_args, evaluation_args = parse_argument(
            args=args,
            initalize=True,
        )

        evaluator = Evaluator(
            model_args=model_args,
            dataset_args=dataset_args,
            evaluation_args=evaluation_args,
            initalize=False,
        )
        return evaluator.evaluate()

    return evaluate
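
For reference, a hypothetical test consuming the fixture; pytest injects the inner evaluate() closure by parameter name, and the flags mirror the dry-run tests above:

def test_gsm8k_dry_run(run_evaluate):
    run_evaluate(["-m", "gpt2", "-d", "gsm8k", "-b", "10", "--dry_run", "--cuda", "0"])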
35 changes: 34 additions & 1 deletion tests/requirements-tests.txt
@@ -1,3 +1,36 @@
# pytest
pytest
pytest-md
pytest-emoji

torch
transformers
safetensors
tokenizers
datasets>=2.16.1
coloredlogs
tqdm>=4.58.0
jinja2

# Efficient inference
packaging
vllm
flash-attn # https://github.com/Dao-AILab/flash-attention/issues/453#issuecomment-1692867770

# API Models
anthropic
dashscope
qianfan
openai>=1.0.0
tiktoken>=0.5.0

# Metrics
nltk
sacrebleu
rouge_score
langcodes
language_data
google-api-python-client
immutabledict
langdetect

2 changes: 0 additions & 2 deletions tests/test_empty.py

This file was deleted.

4 changes: 4 additions & 0 deletions utilization/dataset/crows_pairs.py
@@ -20,6 +20,10 @@ class Crows_pairs(MultipleChoiceDataset):
     example_set = None
     load_args = ("crows_pairs",)

+    def init_arguments(self):
+        if self.model_evaluation_method != "get_ppl":
+            raise ValueError("CrowS-Pairs dataset only supports PPL evaluation method.")
+
     def format_instance(self, instance):
         # source text is empty
         options = [" " + instance["sent_more"], " " + instance["sent_less"]]
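
The new init_arguments guard fails fast when CrowS-Pairs is paired with a model that cannot score perplexity. A standalone sketch of the same pattern (the _Demo class is illustrative, not from the PR):

class _Demo:
    model_evaluation_method = "generation"

    def init_arguments(self):
        if self.model_evaluation_method != "get_ppl":
            raise ValueError("CrowS-Pairs dataset only supports PPL evaluation method.")

try:
    _Demo().init_arguments()
except ValueError as e:
    print(e)  # raised because the evaluation method is not get_ppl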
2 changes: 1 addition & 1 deletion utilization/dataset/gaokao.py
@@ -3,7 +3,7 @@
 from logging import getLogger

 from ..metric import Gaokao_bench_metric
-from .enum import GAOKAO_TASKS
+from .dataset_enum import GAOKAO_TASKS
 from .generation_dataset import GenerationDataset

 logger = getLogger(__name__)
15 changes: 11 additions & 4 deletions utilization/dataset/load.py
@@ -35,10 +35,17 @@ def _import_dataset_class(dataset_name: str) -> Type[Dataset]:
         module = importlib.import_module(module_path)
     except ModuleNotFoundError as e:
         all_datasets = list_datasets()
-        fuzzy_match = difflib.get_close_matches(dataset_name, list(all_datasets), cutoff=0.6)
-        if len(fuzzy_match) == 0:
-            fuzzy_match = all_datasets
-        raise ValueError(f"Invalid dataset: {dataset_name}. Possible choices are: {fuzzy_match}.") from e
+
+        if f"utilization.dataset.{dataset_name}" in str(e):
+            matches = difflib.get_close_matches(dataset_name, list(all_datasets), cutoff=0.6)
+            if len(matches) == 0:
+                fuzzy_match = f" Available choices are: {all_datasets}."
+            else:
+                fuzzy_match = f" Possible choices are: {matches}."
+        else:
+            fuzzy_match = ""
+
+        raise ValueError(f"Invalid dataset: {dataset_name}.{fuzzy_match}\n{e}") from e
     clsmembers = inspect.getmembers(module, inspect.isclass)

     for name, obj in clsmembers:
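
The suggestion logic above relies on difflib.get_close_matches, which returns up to n candidates whose similarity ratio meets the cutoff, or an empty list when nothing is close enough; hence the fallback to listing all datasets. A self-contained example (dataset names illustrative):

import difflib

names = ["gsm8k", "mmlu", "hellaswag"]
print(difflib.get_close_matches("gsmk8", names, cutoff=0.6))   # ['gsm8k']
print(difflib.get_close_matches("foobar", names, cutoff=0.6))  # []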
2 changes: 1 addition & 1 deletion utilization/dataset/utils.py
@@ -39,7 +39,7 @@ def set_tokenizer(
         self.tokenizer_decode = tokenizer.decode

     def _apply_normalization(self, conversations: List[Conversation]):
-        normalized_conversations = [Conversation.from_chat(assistant=conv[-1]) for conv in conversations]
+        normalized_conversations = [Conversation.from_chat(assistant=conv[-1]["content"]) for conv in conversations]
         conversations.extend(normalized_conversations)

     def prompt_token_nums(self, prompt: str):
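
The fix passes the assistant's text instead of the whole message dict, assuming each conversation is a list of {"role": ..., "content": ...} dicts (consistent with conversation.py below, whose new asserts require a str):

conv = [{"role": "user", "content": "Q"}, {"role": "assistant", "content": "A"}]
assert conv[-1]["content"] == "A"   # what the fixed code extracts
assert isinstance(conv[-1], dict)   # what the old code passed by mistake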
10 changes: 10 additions & 0 deletions utilization/utils/conversation.py
@@ -344,8 +344,10 @@ def add(
         if other is None:
             messages = []
             if user:
+                assert isinstance(user, str)
                 messages.append({"role": "user", "content": user})
             if assistant:
+                assert isinstance(assistant, str)
                 messages.append({"role": "assistant", "content": assistant})
         else:
             messages = other.messages
@@ -364,11 +366,19 @@ def add_(
         if other is None:
             messages = []
             if user:
+                assert isinstance(user, str)
                 messages.append({"role": "user", "content": user})
             if assistant:
+                assert isinstance(assistant, str)
                 messages.append({"role": "assistant", "content": assistant})
         else:
             messages = other.messages
         # add a copy of other messages
         self.messages.extend(messages)
         return self
+
+    def __repr__(self):
+        output = f"Conversation id: {self.uuid}\n"
+        for message in self.messages:
+            output += f" > {message['role']}: {message['content']}\n"
+        return output
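
With the new __repr__, printing a conversation yields a header line plus one line per message, along the lines of (uuid value illustrative):

Conversation id: 3f9c...
 > user: What is 2 + 2?
 > assistant: 4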