From 499515836872fddff11ac836199986645a93d652 Mon Sep 17 00:00:00 2001 From: school_platform Date: Thu, 28 Mar 2024 15:54:17 +0800 Subject: [PATCH 1/2] added vllm config --- configs/local_models/bert/bert.py | 18 ++ configs/local_models/qwen/qwen_vllm.py | 15 + configs/lyh/t5_all.py | 0 configs/tests/test_model.py | 10 +- configs/tests/test_vllm.py | 69 +++++ opencompass/models/__init__.py | 1 + opencompass/models/bert.py | 392 +++++++++++++++++++++++++ 7 files changed, 499 insertions(+), 6 deletions(-) create mode 100644 configs/local_models/bert/bert.py create mode 100644 configs/local_models/qwen/qwen_vllm.py create mode 100644 configs/lyh/t5_all.py create mode 100644 configs/tests/test_vllm.py create mode 100644 opencompass/models/bert.py diff --git a/configs/local_models/bert/bert.py b/configs/local_models/bert/bert.py new file mode 100644 index 0000000..f6fcec5 --- /dev/null +++ b/configs/local_models/bert/bert.py @@ -0,0 +1,18 @@ +from opencompass.models import Bert + +from mmengine.config import read_base +with read_base(): + from ...paths import ROOT_DIR + +bert_large_cased = dict( + type=Bert, + abbr='bert_large_cased', + path=ROOT_DIR+"models/google-bert/bert-large-cased", + tokenizer_path=ROOT_DIR+"models/google-bert/bert-large-cased", + tokenizer_kwargs=dict(trust_remote_code=True), + max_out_len=400, + max_seq_len=2048, + batch_size=8, + model_kwargs=dict(trust_remote_code=True), + run_cfg=dict(num_gpus=1, num_procs=1), + ) \ No newline at end of file diff --git a/configs/local_models/qwen/qwen_vllm.py b/configs/local_models/qwen/qwen_vllm.py new file mode 100644 index 0000000..950be76 --- /dev/null +++ b/configs/local_models/qwen/qwen_vllm.py @@ -0,0 +1,15 @@ +from opencompass.models import VLLM + +qwen_1_5b_14b_chat_vllm = dict( + type=VLLM, + abbr='qwen-1.5b-14b-chat', + path="/home/junetheriver/models/qwen/Qwen1.5-14B-Chat", + max_seq_len=2048, + model_kwargs=dict(trust_remote_code=True, max_model_len=2048), + generation_kwargs=dict(), + meta_template=None, + mode='none', + batch_size=1, + use_fastchat_template=False, + end_str=None, +) \ No newline at end of file diff --git a/configs/lyh/t5_all.py b/configs/lyh/t5_all.py new file mode 100644 index 0000000..e69de29 diff --git a/configs/tests/test_model.py b/configs/tests/test_model.py index 89be770..87e650b 100644 --- a/configs/tests/test_model.py +++ b/configs/tests/test_model.py @@ -8,24 +8,22 @@ from ..datasets.opseval.datasets import owl_mc, owl_qa # Models from ..local_models.google.t5 import t5_base - from ..local_models.lmsys.vicuna import vicuna_bases - from ..local_models.internlm.internlm import internlm2_bases - from ..local_models.yi.yi import yi_bases - from ..local_models.mistral.mistral import mistral_7b + from ..local_models.bert.bert import bert_large_cased + from ..paths import ROOT_DIR -yi_bases = [model for model in yi_bases if '34' not in model['abbr']] datasets = [ *owl_mc, *owl_qa, ] datasets = [ - dataset for dataset in datasets if 'Zero-shot' in dataset['abbr'] + dataset for dataset in datasets if 'Zero-shot' in dataset['abbr'] and 'zh' in dataset['abbr'] ] models = [ t5_base, + # bert_large_cased, # *vicuna_bases, # *internlm2_bases, # *yi_bases, diff --git a/configs/tests/test_vllm.py b/configs/tests/test_vllm.py new file mode 100644 index 0000000..168a820 --- /dev/null +++ b/configs/tests/test_vllm.py @@ -0,0 +1,69 @@ +from mmengine.config import read_base +from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks 
import OpenICLInferTask, OpenICLEvalTask + +with read_base(): + # Datasets + from ..datasets.opseval.datasets import owl_mc, owl_qa + # Models + from ..local_models.google.t5 import t5_base + from ..local_models.bert.bert import bert_large_cased + from ..local_models.qwen.qwen_vllm import qwen_1_5b_14b_chat_vllm + + from ..paths import ROOT_DIR + + +datasets = [ + *owl_mc, *owl_qa, +] + +datasets = [ + dataset for dataset in datasets if 'Zero-shot' in dataset['abbr'] and 'zh' in dataset['abbr'] +] + +models = [ + t5_base, + # bert_large_cased, + qwen_1_5b_14b_chat_vllm, + # *vicuna_bases, + # *internlm2_bases, + # *yi_bases, + # mistral_7b +] + +for model in models: + model['run_cfg'] = dict(num_gpus=1, num_procs=1) + pass + +for dataset in datasets: + dataset['sample_setting'] = dict() + dataset['infer_cfg']['inferencer']['save_every'] = 8 + dataset['infer_cfg']['inferencer']['sc_size'] = 2 + dataset['infer_cfg']['inferencer']['max_token_len'] = 20 + dataset['eval_cfg']['sc_size'] = 2 + dataset['sample_setting'] = dict(sample_size=2) # !!!WARNING: Use for testing only!!! + + +infer = dict( + partitioner=dict( + # type=SizePartitioner, + # max_task_size=100, + # gen_task_coef=1, + type=NaivePartitioner + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + max_workers_per_gpu=1, + task=dict(type=OpenICLInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + max_num_workers=32, + task=dict(type=OpenICLEvalTask)), +) diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index 58f1e87..d35e64c 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -33,6 +33,7 @@ # from .custom import CustomModel # noqa from .wenxin_api import WenXinAI # noqa from .t5 import T5 # noqa +from .bert import Bert # noqa from .pangu_api import PanGu # noqa: F401 from .qwen_api import Qwen # noqa: F401 from .sensetime_api import SenseTime # noqa: F401 diff --git a/opencompass/models/bert.py b/opencompass/models/bert.py new file mode 100644 index 0000000..e535dd6 --- /dev/null +++ b/opencompass/models/bert.py @@ -0,0 +1,392 @@ +import os +from typing import Dict, List, Optional, Union + +import numpy as np +import torch + +from opencompass.models.base import BaseModel +from opencompass.registry import MODELS +from opencompass.utils.logging import get_logger +from opencompass.utils.prompt import PromptList +from transformers import GenerationConfig + +PromptType = Union[PromptList, str] + + +@MODELS.register_module() +class Bert(BaseModel): + """Model wrapper around Bert general models. + + Args: + path (str): The name or path to Bert's model. + hf_cache_dir: Set the cache dir to HF model cache dir. If None, it will + use the env variable HF_MODEL_HUB. Defaults to None. + max_seq_len (int): The maximum length of the input sequence. Defaults + to 2048. + tokenizer_path (str): The path to the tokenizer. Defaults to None. + tokenizer_kwargs (dict): Keyword arguments for the tokenizer. + Defaults to {}. + peft_path (str, optional): The name or path to the Bert's PEFT + model. If None, the original model will not be converted to PEFT. + Defaults to None. + tokenizer_only (bool): If True, only the tokenizer will be initialized. + Defaults to False. + model_kwargs (dict): Keyword arguments for the model, used in loader. + Defaults to dict(device_map='auto'). 
+ meta_template (Dict, optional): The model's meta prompt + template if needed, in case the requirement of injecting or + wrapping of any meta instructions. + extract_pred_after_decode (bool): Whether to extract the prediction + string from the decoded output string, instead of extract the + prediction tokens before decoding. Defaults to False. + batch_padding (bool): If False, inference with be performed in for-loop + without batch padding. + + Note: + About ``extract_pred_after_decode``: Commonly, we should extract the + the prediction tokens before decoding. But for some tokenizers using + ``sentencepiece``, like LLaMA, this behavior may change the number of + whitespaces, which is harmful for Python programming tasks. + """ + + def __init__( + self, + path: str, + hf_cache_dir: Optional[str] = None, + max_seq_len: int = 2048, + tokenizer_path: Optional[str] = None, + tokenizer_kwargs: dict = dict(), + peft_path: Optional[str] = None, + tokenizer_only: bool = False, + model_kwargs: dict = dict(device_map='auto'), + meta_template: Optional[Dict] = None, + extract_pred_after_decode: bool = False, + batch_padding: bool = False, + generate_kwargs: dict = None, + ): + super().__init__(path=path, + max_seq_len=max_seq_len, + tokenizer_only=tokenizer_only, + meta_template=meta_template) + from opencompass.utils.fileio import patch_hf_auto_model + if hf_cache_dir is None: + hf_cache_dir = os.getenv('HF_MODEL_HUB', None) + patch_hf_auto_model(hf_cache_dir) + self.logger = get_logger() + self._load_tokenizer(path=path, + tokenizer_path=tokenizer_path, + tokenizer_kwargs=tokenizer_kwargs) + self.batch_padding = batch_padding + self.extract_pred_after_decode = extract_pred_after_decode + self.generate_kwargs = generate_kwargs if generate_kwargs else dict() + if not tokenizer_only: + self._load_model(path=path, + model_kwargs=model_kwargs, + peft_path=peft_path) + + def _load_tokenizer(self, path: str, tokenizer_path: Optional[str], + tokenizer_kwargs: dict): + from transformers import AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_path if tokenizer_path else path, **tokenizer_kwargs) + + # A patch for QwenTokenizer + if self.tokenizer.__class__.__name__ == 'QWenTokenizer': + self.tokenizer.pad_token_id = self.tokenizer.eod_id + self.tokenizer.bos_token_id = self.tokenizer.eod_id + self.tokenizer.eos_token_id = self.tokenizer.eod_id + + if self.tokenizer.pad_token_id is None: + self.logger.warning('pad_token_id is not set for the tokenizer. ' + 'Using eos_token_id as pad_token_id.' 
+ f'Which is {self.tokenizer.eos_token_id}') + self.tokenizer.pad_token = self.tokenizer.eos_token + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + + + # A patch for llama when batch_padding = True + if 'decapoda-research/llama' in path or \ + (tokenizer_path and + 'decapoda-research/llama' in tokenizer_path): + self.logger.warning('We set new pad_token_id for LLaMA model') + # keep consistent with official LLaMA repo + # https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb # noqa + self.tokenizer.bos_token = '' + self.tokenizer.eos_token = '' + self.tokenizer.pad_token_id = 0 + + def _load_model(self, + path: str, + model_kwargs: dict, + peft_path: Optional[str] = None): + from transformers import BertLMHeadModel + + model_kwargs.setdefault('torch_dtype', torch.float16) + self.model = BertLMHeadModel.from_pretrained(path, **model_kwargs) + if peft_path is not None: + from peft import PeftModel + self.model = PeftModel.from_pretrained(self.model, + peft_path, + is_trainable=False) + self.model.eval() + + # A patch for llama when batch_padding = True + if 'decapoda-research/llama' in path: + self.model.config.bos_token_id = 1 + self.model.config.eos_token_id = 2 + self.model.config.pad_token_id = self.tokenizer.pad_token_id + + def generate(self, inputs: List[str], max_out_len: int, + **kwargs) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[str]): A list of strings. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + kwargs = {**kwargs, **self.generate_kwargs} + + if self.batch_padding and len(inputs) > 1: + return self._batch_generate(inputs=inputs, + max_out_len=max_out_len, + **kwargs) + else: + return sum((self._single_generate( + inputs=[input_], max_out_len=max_out_len, **kwargs) + for input_ in inputs), []) + + def _batch_generate(self, inputs: List[str], max_out_len: int, + **kwargs) -> List[str]: + """Support for batch prompts inference. + + Args: + inputs (List[str]): A list of strings. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + if self.extract_pred_after_decode: + prompt_lens = [len(input_) for input_ in inputs] + + # step-1: tokenize the input with batch_encode_plus + tokens = self.tokenizer.batch_encode_plus(inputs, + padding=True, + truncation=True, + max_length=self.max_seq_len - + max_out_len) + tokens = { + k: torch.tensor(np.array(tokens[k]), device=self.model.device) + for k in tokens if k in ['input_ids', 'attention_mask'] + } + + # step-2: conduct model forward to generate output + outputs = self.model.generate(**tokens, + max_new_tokens=max_out_len, + **kwargs) + + if not self.extract_pred_after_decode: + outputs = outputs[:, tokens['input_ids'].shape[1]:] + + decodeds = self.tokenizer.batch_decode(outputs, + skip_special_tokens=True) + + if self.extract_pred_after_decode: + decodeds = [ + token[len_:] for token, len_ in zip(decodeds, prompt_lens) + ] + + return decodeds + + def _single_generate(self, inputs: List[str], max_out_len: int, + **kwargs) -> List[str]: + """Support for single prompt inference. + + Args: + inputs (List[str]): A list of strings. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. 
+ """ + if self.extract_pred_after_decode: + prompt_lens = [len(input_) for input_ in inputs] + + input_ids = self.tokenizer(inputs, + truncation=True, + max_length=self.max_seq_len - + max_out_len)['input_ids'] + input_ids = torch.tensor(input_ids, device=self.model.device) + # To accommodate the PeftModel, parameters should be passed in + # key-value format for generate. + outputs = self.model.generate(input_ids=input_ids, + max_new_tokens=max_out_len, + **kwargs) + + if not self.extract_pred_after_decode: + outputs = outputs[:, input_ids.shape[1]:] + + decodeds = self.tokenizer.batch_decode(outputs, + skip_special_tokens=True) + + if self.extract_pred_after_decode: + decodeds = [ + token[len_:] for token, len_ in zip(decodeds, prompt_lens) + ] + + return decodeds + + def get_logits(self, inputs: List[str]): + + if self.batch_padding and len(inputs) > 1: + # batch inference + tokens = self.tokenizer(inputs, + padding=True, + truncation=True, + max_length=self.max_seq_len) + + tokens = { + k: torch.tensor(np.array(tokens[k]), device=self.model.device) + for k in tokens if k in ['input_ids', 'attention_mask'] + } + outputs = self.model(**tokens) + + else: + inputs = self.tokenizer( + inputs, + padding=False, + truncation=True, + max_length=self.max_seq_len) + input_ids = inputs['input_ids'] + # input_ids = torch.tensor(input_ids, device=self.model.device) + tokens = {'input_ids': input_ids} + + # outputs = self.model(input_ids, decoder_input_ids=input_ids) + outputs = self.model(**inputs, labels=input_ids) + return outputs[0], {'tokens': tokens} + + def get_ppl(self, + inputs: List[str], + mask_length: Optional[List[int]] = None) -> List[float]: + """Get perplexity scores given a list of inputs. + + Args: + inputs (List[str]): A list of strings. + mask_length (Optional[List[int]]): A list of mask lengths. If + provided, the perplexity scores will be calculated with the + first mask_length[i] tokens masked out. It's okay to skip + its implementation if advanced features in PPLInfernecer is + not needed. + + Returns: + List[float]: A list of perplexity scores. + """ + + if self.batch_padding and len(inputs) > 1: + assert self.tokenizer.pad_token + return self._get_ppl(inputs, mask_length=mask_length) + else: + return np.concatenate([ + self._get_ppl(inputs=[text], mask_length=mask_length) + for text in inputs + ]) + + def _get_ppl(self, + inputs: List[str], + mask_length: Optional[List[int]] = None) -> List[float]: + """Get perplexity scores given a list of inputs. + + Args: + inputs (List[str]): A list of strings. + mask_length (Optional[List[int]]): A list of mask lengths. If + provided, the perplexity scores will be calculated with the + first mask_length[i] tokens masked out. It's okay to skip + its implementation if advanced features in PPLInfernecer is + not needed. + + Returns: + List[float]: A list of perplexity scores. + """ + + outputs, inputs = self.get_logits(inputs) + shift_logits = outputs[..., :-1, :].contiguous() + + shift_labels = inputs['tokens']['input_ids'][..., 1:].contiguous() + + # if not self.tokenizer.pad_token_id: + # self.tokenizer.pad_token_id = 151643 # TODO: temporally measure!!! PLEASE FIX LATER!! 
+ loss_fct = torch.nn.CrossEntropyLoss( + reduction='none', ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)).view(shift_labels.size()) + + if mask_length is not None: + mask = torch.zeros_like(shift_labels) # [batch,seqlen] + for i in range(len(mask)): + for j in range(mask_length[i] - 1, len(mask[i])): + mask[i][j] = 1 + loss = loss * mask + + lens = (inputs['tokens']['input_ids'] != + self.tokenizer.pad_token_id).sum(-1).cpu().numpy() + if mask_length is not None: + lens -= np.array(mask_length) + ce_loss = loss.sum(-1).cpu().detach().to(torch.float).cpu().numpy() / lens # FIXING ERROR BFloat16 unsupported + return ce_loss + + def get_token_len(self, prompt: str) -> int: + """Get lengths of the tokenized strings. + + Args: + prompt (str): Input string. + + Returns: + int: Length of the input tokens + """ + return len(self.tokenizer.encode(prompt)) + + +@MODELS.register_module() +class BertCausalLM(Bert): + """Model wrapper around Bert CausalLM. + + Args: + path (str): The name or path to Bert's model. + hf_cache_dir: Set the cache dir to HF model cache dir. If None, it will + use the env variable HF_MODEL_HUB. Defaults to None. + max_seq_len (int): The maximum length of the input sequence. Defaults + to 2048. + tokenizer_path (str): The path to the tokenizer. Defaults to None. + tokenizer_kwargs (dict): Keyword arguments for the tokenizer. + Defaults to {}. + peft_path (str, optional): The name or path to the Bert's PEFT + model. If None, the original model will not be converted to PEFT. + Defaults to None. + tokenizer_only (bool): If True, only the tokenizer will be initialized. + Defaults to False. + model_kwargs (dict): Keyword arguments for the model, used in loader. + Defaults to dict(device_map='auto'). + meta_template (Dict, optional): The model's meta prompt + template if needed, in case the requirement of injecting or + wrapping of any meta instructions. + batch_padding (bool): If False, inference with be performed in for-loop + without batch padding. 
+ """ + + def _load_model(self, + path: str, + model_kwargs: dict, + peft_path: Optional[str] = None): + from transformers import AutoModelForCausalLM + + model_kwargs.setdefault('torch_dtype', torch.float16) + self.model = AutoModelForCausalLM.from_pretrained(path, **model_kwargs) + if peft_path is not None: + from peft import PeftModel + self.model = PeftModel.from_pretrained(self.model, + peft_path, + is_trainable=False) + self.model.eval() From 646578e9daee823a7cfc532a02cf835284b51142 Mon Sep 17 00:00:00 2001 From: school_platform Date: Mon, 1 Apr 2024 15:41:16 +0800 Subject: [PATCH 2/2] vllm configs and added question to reference in prediction outputs --- configs/datasets/opseval/datasets.py | 35 +++++++--- configs/datasets/opseval/qa_gen.py | 5 +- configs/local_models/baichuan/baichuan.py | 12 +++- configs/local_models/google/gemma.py | 8 ++- configs/local_models/internlm/internlm.py | 13 +++- configs/local_models/lmsys/vicuna.py | 9 ++- configs/local_models/mistral/mistral.py | 6 +- configs/local_models/model_template.py | 18 +++++ configs/local_models/qwen/qwen.py | 21 +++++- configs/local_models/qwen/qwen_vllm.py | 15 ---- configs/local_models/yi/yi.py | 39 +++++++++-- configs/lyh/vllm_qa.py | 69 +++++++++++++++++++ configs/lyh/vllm_qa_1.py | 67 ++++++++++++++++++ configs/lyh/vllm_qa_2.py | 64 +++++++++++++++++ configs/tests/test_model_datasets.py | 69 +++++++++++++++++++ opencompass/openicl/icl_evaluator/__init__.py | 2 +- .../icl_evaluator/opseval_gen_evaluator.py | 42 ++++++++++- .../icl_retriever/icl_base_retriever.py | 3 +- 18 files changed, 450 insertions(+), 47 deletions(-) delete mode 100644 configs/local_models/qwen/qwen_vllm.py create mode 100644 configs/lyh/vllm_qa.py create mode 100644 configs/lyh/vllm_qa_1.py create mode 100644 configs/lyh/vllm_qa_2.py create mode 100644 configs/tests/test_model_datasets.py diff --git a/configs/datasets/opseval/datasets.py b/configs/datasets/opseval/datasets.py index e6895e4..89a9b12 100644 --- a/configs/datasets/opseval/datasets.py +++ b/configs/datasets/opseval/datasets.py @@ -63,15 +63,22 @@ def get_selected_datasets(setting_func, name, path, langs, qtypes): network_mc = network_mc_ppl + network_mc_gen company_mc_list = [ - ('bosc', f'{ROOT_DIR}data/opseval/bosc/splitted', ['zh'], ['single']), + ('bosc', f'{ROOT_DIR}data/opseval/bosc/splitted_v2', ['zh'], ['single', 'multiple']), # ('dfcdata', f'{ROOT_DIR}data/opseval/dfcdata/splitted', ['zh'], ['single']), - ('gtja', f'{ROOT_DIR}data/opseval/gtja/splitted', ['zh'], ['single']), + ('gtja', f'{ROOT_DIR}data/opseval/gtja/splitted_v2', ['zh'], ['single', 'multiple']), ('huaweicloud', f'{ROOT_DIR}data/opseval/huaweicloud/splitted', ['zh'], ['single', 'multiple']), ('lenovo', f'{ROOT_DIR}data/opseval/lenovo/splitted', ['zh'], ['single', 'multiple']), - ('pufa', f'{ROOT_DIR}data/opseval/pufa/splitted', ['zh'], ['single']), - ('rzy', f'{ROOT_DIR}data/opseval/rzy/splitted', ['zh'], ['single']), - ('zabbix', f'{ROOT_DIR}data/opseval/zabbix/splitted', ['zh'], ['single']), - ('zjyd', f'{ROOT_DIR}data/opseval/zjyd/', ['zh'], ['single']), + ('pufa', f'{ROOT_DIR}data/opseval/pufa/splitted_v2', ['zh'], ['single', 'multiple']), + ('rzy', f'{ROOT_DIR}data/opseval/rzy/splitted_v2', ['zh'], ['single', 'multiple']), + ('zabbix', f'{ROOT_DIR}data/opseval/zabbix/splitted_v2', ['zh'], ['single', 'multiple']), + ('zjyd', f'{ROOT_DIR}data/opseval/zjyd/splitted_v2', ['zh'], ['single', 'multiple']), +] + +company_qa_list = [ + ('bosc', f'{ROOT_DIR}data/opseval/bosc/splitted_v2', ['zh'], None), + ('rzy', 
f'{ROOT_DIR}data/opseval/rzy/splitted_v2', ['zh'], None), + ('zabbix', f'{ROOT_DIR}data/opseval/zabbix/splitted_v2', ['zh'], None), + ('zjyd', f'{ROOT_DIR}data/opseval/zjyd/splitted_v2', ['zh'], None), ] company_mc_ppl = sum([ @@ -82,16 +89,24 @@ def get_selected_datasets(setting_func, name, path, langs, qtypes): ], []) company_mc = company_mc_ppl+company_mc_gen +company_qa_ppl = sum([ + get_selected_datasets([get_qa_ppl_datasets], name, path, langs, qtypes) for name, path, langs, qtypes in company_qa_list +], []) +company_qa_gen = sum([ + get_selected_datasets([get_qa_gen_datasets], name, path, langs, qtypes) for name, path, langs, qtypes in company_qa_list +], []) +company_qa = company_qa_ppl+company_qa_gen + -rzy_qa_ppl = get_selected_datasets([get_qa_ppl_datasets], 'rzy', f'{ROOT_DIR}data/opseval/rzy/splitted', langs=['zh'], qtypes=None) -rzy_qa_gen = get_selected_datasets([get_qa_gen_datasets], 'rzy', f'{ROOT_DIR}data/opseval/rzy/splitted', langs=['zh'], qtypes=None) +rzy_qa_ppl = get_selected_datasets([get_qa_ppl_datasets], 'rzy', f'{ROOT_DIR}data/opseval/rzy/splitted_v2', langs=['zh'], qtypes=None) +rzy_qa_gen = get_selected_datasets([get_qa_gen_datasets], 'rzy', f'{ROOT_DIR}data/opseval/rzy/splitted_v2', langs=['zh'], qtypes=None) rzy_qa = rzy_qa_ppl + rzy_qa_gen all_ppl_mc = zte_mc_ppl + oracle_mc_ppl + owl_mc_ppl + network_mc_ppl + company_mc_ppl all_gen_mc = zte_mc_gen + oracle_mc_gen + owl_mc_gen + network_mc_gen + company_mc_gen -all_ppl_qa = owl_qa_ppl + rzy_qa_ppl -all_gen_qa = owl_qa_gen + rzy_qa_gen +all_ppl_qa = owl_qa_ppl + company_qa_ppl +all_gen_qa = owl_qa_gen + company_qa_gen ceval_mc_ppl = get_selected_datasets([get_mc_ppl_datasets], 'ceval', f'{ROOT_DIR}data/opseval/ceval/splitted_dev', langs=['zh'], qtypes=['single']) ceval_mc_gen = get_selected_datasets([get_mc_gen_datasets], 'ceval', f'{ROOT_DIR}data/opseval/ceval/splitted_dev', langs=['zh'], qtypes=['single']) diff --git a/configs/datasets/opseval/qa_gen.py b/configs/datasets/opseval/qa_gen.py index 36d03a7..d3edc01 100644 --- a/configs/datasets/opseval/qa_gen.py +++ b/configs/datasets/opseval/qa_gen.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import FixKRetriever, ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer, SCInferencer, CoTInferencer -from opencompass.openicl.icl_evaluator import AccEvaluator, BleuRougeEvaluator +from opencompass.openicl.icl_evaluator import AccEvaluator, BleuRougeEvaluator, OpsEvalGenQAEvaluator from opencompass.utils.text_postprocessors import first_capital_postprocess_multi from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer @@ -34,7 +34,8 @@ def get_qa_gen_datasets(dataset_name, path, langs=['zh'], qtypes=None): retriever=retriever_dict, inferencer=get_gen_inferencer(sc=False), ), - eval_cfg=dict(evaluator=dict(type=BleuRougeEvaluator)) + # eval_cfg=dict(evaluator=dict(type=BleuRougeEvaluator)) + eval_cfg=dict(evaluator=dict(type=OpsEvalGenQAEvaluator, language=lang)) ) for shot_abbr, shot_hint_id, retriever_dict in zip( ['Zero-shot', '3-shot'], diff --git a/configs/local_models/baichuan/baichuan.py b/configs/local_models/baichuan/baichuan.py index a474d05..e07e2a1 100644 --- a/configs/local_models/baichuan/baichuan.py +++ b/configs/local_models/baichuan/baichuan.py @@ -3,7 +3,7 @@ from mmengine import read_base with read_base(): from ...paths import ROOT_DIR - from ..model_template import get_default_model + 
from ..model_template import get_default_model, get_vllm_model from ...api_key import baichuan_key _meta_template = dict( @@ -23,6 +23,16 @@ baichuan2_chats = [baichuan2_7b_chat, baichuan2_13b_chat] +baichuan2_7b_base_vllm = get_vllm_model(abbr="baichuan2-7b", path=f"{ROOT_DIR}models/baichuan-inc/Baichuan2-7B-Base") +baichuan2_7b_chat_vllm = get_vllm_model(abbr="baichuan2-7b-chat", path=f"{ROOT_DIR}models/baichuan-inc/Baichuan2-7B-Chat", meta_template=_meta_template) + +baichuan2_13b_base_vllm = get_vllm_model(abbr="baichuan2-13b", path=f"{ROOT_DIR}models/baichuan-inc/Baichuan2-13B-Base") +baichuan2_13b_chat_vllm = get_vllm_model(abbr="baichuan2-13b-chat", path=f"{ROOT_DIR}models/baichuan-inc/Baichuan2-13B-Chat", meta_template=_meta_template, num_gpus=1) + +baichuan2_bases_vllm = [baichuan2_7b_base_vllm, baichuan2_13b_base_vllm] +baichuan2_chats_vllm = [baichuan2_7b_chat_vllm, baichuan2_13b_chat_vllm] + + baichuan2_turbo = dict(abbr='Baichuan2-Turbo', type=BaichuanAPI, path='Baichuan2-Turbo', key=baichuan_key, max_out_len=100, max_seq_len=2048, batch_size=1) diff --git a/configs/local_models/google/gemma.py b/configs/local_models/google/gemma.py index dfc950b..d6c2dfc 100644 --- a/configs/local_models/google/gemma.py +++ b/configs/local_models/google/gemma.py @@ -3,6 +3,7 @@ from mmengine.config import read_base with read_base(): from ...paths import ROOT_DIR + from ..model_template import get_default_model, get_vllm_model gemma_2b = dict( type=HuggingFaceCausalLM, @@ -28,4 +29,9 @@ batch_size=8, model_kwargs=dict(device_map='auto', trust_remote_code=True), run_cfg=dict(num_gpus=1, num_procs=1), - ) \ No newline at end of file + ) + +gemma_2b_vllm = get_vllm_model(abbr="gemma-2b", path=ROOT_DIR+"models/google/gemma-2b") +gemma_7b_vllm = get_vllm_model(abbr="gemma-7b", path=ROOT_DIR+"models/google/gemma-7b") + +gemma_vllm = [gemma_2b_vllm, gemma_7b_vllm] \ No newline at end of file diff --git a/configs/local_models/internlm/internlm.py b/configs/local_models/internlm/internlm.py index 072395e..6764489 100644 --- a/configs/local_models/internlm/internlm.py +++ b/configs/local_models/internlm/internlm.py @@ -3,7 +3,7 @@ from mmengine.config import read_base with read_base(): from ...paths import ROOT_DIR - from ..model_template import get_default_model + from ..model_template import get_default_model, get_vllm_model internlm2_chat_7b = dict( type=HuggingFaceCausalLM, @@ -35,4 +35,13 @@ internlm2_20b = get_default_model(abbr="internlm2-20b", path=ROOT_DIR+"models/Shanghai_AI_Laboratory/internlm2-20b", num_gpus=2) internlm2_bases = [internlm2_7b, internlm2_20b] -internlm2_chats = [internlm2_chat_20b, internlm2_chat_7b] \ No newline at end of file +internlm2_chats = [internlm2_chat_20b, internlm2_chat_7b] + +internlm2_7b_vllm = get_vllm_model(abbr="internlm2-7b", path=ROOT_DIR+"models/Shanghai_AI_Laboratory/internlm2-7b") +internlm2_20b_vllm = get_vllm_model(abbr="internlm2-20b", path=ROOT_DIR+"models/Shanghai_AI_Laboratory/internlm2-20b", num_gpus=4) + +internlm2_chat_7b_vllm = get_vllm_model(abbr="internlm2-chat-7b", path=ROOT_DIR+"models/Shanghai_AI_Laboratory/internlm2-chat-7b") +internlm2_chat_20b_vllm = get_vllm_model(abbr="internlm2-chat-20b", path=ROOT_DIR+"models/Shanghai_AI_Laboratory/internlm2-chat-20b", num_gpus=4) + +internlm2_bases_vllm = [internlm2_7b_vllm, internlm2_20b_vllm] +internlm2_chats_vllm = [internlm2_chat_20b_vllm, internlm2_chat_7b_vllm] \ No newline at end of file diff --git a/configs/local_models/lmsys/vicuna.py b/configs/local_models/lmsys/vicuna.py index 
3818996..a8a7fc2 100644 --- a/configs/local_models/lmsys/vicuna.py +++ b/configs/local_models/lmsys/vicuna.py @@ -1,10 +1,15 @@ from mmengine.config import read_base with read_base(): - from ..model_template import get_default_model + from ..model_template import get_default_model, get_vllm_model from ...paths import ROOT_DIR vicuna_7b_v1_5 = get_default_model(abbr="vicuna-7b-v1.5", path=f"{ROOT_DIR}models/lmsys/vicuna-7b-v1.5") vicuna_13b_v1_5 = get_default_model(abbr="vicuna-13b-v1.5", path=f"{ROOT_DIR}models/lmsys/vicuna-13b-v1.5") -vicuna_bases = [vicuna_7b_v1_5, vicuna_13b_v1_5] \ No newline at end of file +vicuna_bases = [vicuna_7b_v1_5, vicuna_13b_v1_5] + +vicuna_7b_v1_5_vllm = get_vllm_model(abbr="vicuna-7b-v1.5", path=f"{ROOT_DIR}models/lmsys/vicuna-7b-v1.5") +vicuna_13b_v1_5_vllm = get_vllm_model(abbr="vicuna-13b-v1.5", path=f"{ROOT_DIR}models/lmsys/vicuna-13b-v1.5") + +vicuna_bases_vllm = [vicuna_7b_v1_5_vllm, vicuna_13b_v1_5_vllm] \ No newline at end of file diff --git a/configs/local_models/mistral/mistral.py b/configs/local_models/mistral/mistral.py index 684daf0..0606400 100644 --- a/configs/local_models/mistral/mistral.py +++ b/configs/local_models/mistral/mistral.py @@ -1,6 +1,8 @@ from mmengine.config import read_base with read_base(): - from ..model_template import get_default_model + from ..model_template import get_default_model, get_vllm_model from ...paths import ROOT_DIR -mistral_7b = get_default_model(abbr="mistral-7b", path=f"{ROOT_DIR}models/mistralai/Mistral-7B-v0.1") \ No newline at end of file +mistral_7b = get_default_model(abbr="mistral-7b", path=f"{ROOT_DIR}models/mistralai/Mistral-7B-v0.1") + +mistral_7b_vllm = get_vllm_model(abbr="mistral-7b", path=f"{ROOT_DIR}models/mistralai/Mistral-7B-v0.1") \ No newline at end of file diff --git a/configs/local_models/model_template.py b/configs/local_models/model_template.py index 82c7598..3c214b7 100644 --- a/configs/local_models/model_template.py +++ b/configs/local_models/model_template.py @@ -1,4 +1,5 @@ from opencompass.models.huggingface import HuggingFaceCausalLM +from opencompass.models import VLLM def get_default_model(abbr, path, num_gpus=1, meta_template=None): @@ -17,4 +18,21 @@ def get_default_model(abbr, path, num_gpus=1, meta_template=None): batch_size=8, model_kwargs=dict(device_map='auto', trust_remote_code=True), run_cfg=dict(num_gpus=num_gpus, num_procs=1), + ) + +def get_vllm_model(abbr, path, num_gpus=1, meta_template=None, generation_kwargs=dict()): + return dict( + type=VLLM, + abbr=abbr, + path=path, + max_seq_len=2048, + model_kwargs=dict(trust_remote_code=True, max_model_len=2000, tensor_parallel_size=num_gpus, gpu_memory_utilization=0.95), + generation_kwargs=generation_kwargs, + meta_template=meta_template, + max_out_len=400, + mode='none', + batch_size=1, + use_fastchat_template=False, + run_cfg=dict(num_gpus=num_gpus, num_procs=1), + end_str=None, ) \ No newline at end of file diff --git a/configs/local_models/qwen/qwen.py b/configs/local_models/qwen/qwen.py index a64a7db..2b364d3 100644 --- a/configs/local_models/qwen/qwen.py +++ b/configs/local_models/qwen/qwen.py @@ -2,7 +2,7 @@ from mmengine.config import read_base with read_base(): from ...paths import ROOT_DIR - from ..model_template import get_default_model + from ..model_template import get_default_model, get_vllm_model qwen_meta_template = dict( round=[ @@ -126,4 +126,21 @@ batch_size=8, model_kwargs=dict(device_map='auto', trust_remote_code=True), run_cfg=dict(num_gpus=1, num_procs=1), - ) \ No newline at end of file + ) + 
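+# vLLM variants of the Qwen1.5 family. Note that `quant` below iterates over
+# model sizes in billions of parameters (not quantization levels); the 72B
+# checkpoints are given 4 GPUs, everything else runs on a single GPU.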
+qwen1_5_base_vllm_models = [ + get_vllm_model( + abbr=f"qwen1.5-{quant}b-base", + path=f"{ROOT_DIR}models/Qwen/Qwen1.5-{quant}B", + num_gpus=1 if "72" not in quant else 4) + for quant in ["0.5", "1.8", "4", "7", "14", "72"] +] + +qwen1_5_chat_vllm_models = [ + get_vllm_model( + abbr=f"qwen1.5-{quant}b-chat", + meta_template=qwen_meta_template, + num_gpus=1 if "72" not in quant else 4, + path=f"{ROOT_DIR}models/Qwen/Qwen1.5-{quant}B-Chat") + for quant in ["0.5", "1.8", "4", "7", "14", "72"] +] \ No newline at end of file diff --git a/configs/local_models/qwen/qwen_vllm.py b/configs/local_models/qwen/qwen_vllm.py deleted file mode 100644 index 950be76..0000000 --- a/configs/local_models/qwen/qwen_vllm.py +++ /dev/null @@ -1,15 +0,0 @@ -from opencompass.models import VLLM - -qwen_1_5b_14b_chat_vllm = dict( - type=VLLM, - abbr='qwen-1.5b-14b-chat', - path="/home/junetheriver/models/qwen/Qwen1.5-14B-Chat", - max_seq_len=2048, - model_kwargs=dict(trust_remote_code=True, max_model_len=2048), - generation_kwargs=dict(), - meta_template=None, - mode='none', - batch_size=1, - use_fastchat_template=False, - end_str=None, -) \ No newline at end of file diff --git a/configs/local_models/yi/yi.py b/configs/local_models/yi/yi.py index 8a36f4e..c2094f8 100644 --- a/configs/local_models/yi/yi.py +++ b/configs/local_models/yi/yi.py @@ -1,17 +1,42 @@ from mmengine.config import read_base with read_base(): from ...paths import ROOT_DIR - from ..model_template import get_default_model + from ..model_template import get_default_model, get_vllm_model +yi_meta_template = dict( + round=[ + dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>'), + dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>', generate=True), + ], +) yi_6b = get_default_model(abbr="yi-6b", path=f"{ROOT_DIR}models/01-ai/Yi-6B") yi_9b = get_default_model(abbr="yi-9b", path=f"{ROOT_DIR}models/01-ai/Yi-9B") -yi_34b = get_default_model(abbr="yi-34b", path=f"{ROOT_DIR}models/01-ai/Yi-34B") +yi_34b = get_default_model(abbr="yi-34b", path=f"{ROOT_DIR}models/01-ai/Yi-34B", num_gpus=2) -yi_6b_chat = get_default_model(abbr="yi-6b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-6B-Chat") -yi_9b_chat = get_default_model(abbr="yi-9b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-9B-Chat") -yi_34b_chat = get_default_model(abbr="yi-34b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-34B-Chat") +yi_6b_chat = get_default_model(abbr="yi-6b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-6B-Chat", meta_template=yi_meta_template) +# yi_9b_chat = get_default_model(abbr="yi-9b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-9B-Chat", meta_template=yi_meta_template) +yi_34b_chat = get_default_model(abbr="yi-34b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-34B-Chat", meta_template=yi_meta_template, num_gpus=4) yi_bases = [yi_6b, yi_9b, yi_34b] -yi_chats = [yi_6b_chat, yi_9b_chat, yi_34b_chat] -yi_all = yi_bases + yi_chats \ No newline at end of file +yi_chats = [yi_6b_chat, + # yi_9b_chat, + yi_34b_chat] +yi_all = yi_bases + yi_chats + + + + +yi_6b_vllm = get_vllm_model(abbr="yi-6b", path=f"{ROOT_DIR}models/01-ai/Yi-6B") +yi_9b_vllm = get_vllm_model(abbr="yi-9b", path=f"{ROOT_DIR}models/01-ai/Yi-9B") +yi_34b_vllm = get_vllm_model(abbr="yi-34b", path=f"{ROOT_DIR}models/01-ai/Yi-34B", num_gpus=2) + +yi_6b_chat_vllm = get_vllm_model(abbr="yi-6b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-6B-Chat", meta_template=yi_meta_template) +# yi_9b_chat_vllm = get_vllm_model(abbr="yi-9b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-9B-Chat", meta_template=yi_meta_template) +yi_34b_chat_vllm = 
get_vllm_model(abbr="yi-34b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-34B-Chat", meta_template=yi_meta_template, num_gpus=4) + +yi_bases_vllm = [yi_6b_vllm, yi_9b_vllm, yi_34b_vllm] +yi_chats_vllm = [yi_6b_chat_vllm, + # yi_9b_chat_vllm, + yi_34b_chat_vllm] +yi_all_vllm = yi_bases_vllm + yi_chats_vllm \ No newline at end of file diff --git a/configs/lyh/vllm_qa.py b/configs/lyh/vllm_qa.py new file mode 100644 index 0000000..f329e3e --- /dev/null +++ b/configs/lyh/vllm_qa.py @@ -0,0 +1,69 @@ +from mmengine.config import read_base +from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +with read_base(): + # Datasets + from ..datasets.opseval.datasets import all_gen_qa + # Models + from ..local_models.baichuan.baichuan import baichuan2_chats_vllm + from ..local_models.google.gemma import gemma_vllm + from ..local_models.internlm.internlm import internlm2_chats_vllm + from ..local_models.lmsys.vicuna import vicuna_bases_vllm + from ..local_models.mistral.mistral import mistral_7b_vllm + from ..local_models.qwen.qwen import qwen1_5_chat_vllm_models + from ..local_models.yi.yi import yi_all_vllm + from ..models.gpt_3dot5_turbo_peiqi import models as peiqi_models + + from ..paths import ROOT_DIR + + +datasets = [ + *all_gen_qa +] + +# datasets = [datasets[0]] + +models = [ + *peiqi_models, + *baichuan2_chats_vllm, + *gemma_vllm, + *internlm2_chats_vllm, + *vicuna_bases_vllm, + *yi_all_vllm, + mistral_7b_vllm, + *qwen1_5_chat_vllm_models, +] + +for dataset in datasets: + dataset['sample_setting'] = dict() + dataset['infer_cfg']['inferencer']['save_every'] = 8 + dataset['infer_cfg']['inferencer']['sc_size'] = 1 + dataset['infer_cfg']['inferencer']['max_token_len'] = 200 + dataset['eval_cfg']['sc_size'] = 1 + # dataset['sample_setting'] = dict(sample_size=2) # !!!WARNING: Use for testing only!!! 
+ + +infer = dict( + partitioner=dict( + # type=SizePartitioner, + # max_task_size=100, + # gen_task_coef=1, + type=NaivePartitioner + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + max_workers_per_gpu=1, + task=dict(type=OpenICLInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + max_num_workers=32, + task=dict(type=OpenICLEvalTask)), +) diff --git a/configs/lyh/vllm_qa_1.py b/configs/lyh/vllm_qa_1.py new file mode 100644 index 0000000..695c5c8 --- /dev/null +++ b/configs/lyh/vllm_qa_1.py @@ -0,0 +1,67 @@ +from mmengine.config import read_base +from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +with read_base(): + # Datasets + from ..datasets.opseval.datasets import all_gen_qa + # Models + from ..local_models.baichuan.baichuan import baichuan2_chats_vllm + from ..local_models.google.gemma import gemma_vllm + from ..local_models.internlm.internlm import internlm2_chats_vllm + from ..local_models.lmsys.vicuna import vicuna_bases_vllm + from ..local_models.mistral.mistral import mistral_7b_vllm + from ..local_models.qwen.qwen import qwen1_5_chat_vllm_models + from ..local_models.yi.yi import yi_all_vllm + from ..models.gpt_3dot5_turbo_peiqi import models as peiqi_models + + from ..paths import ROOT_DIR + + +datasets = [ + *all_gen_qa +] + +# datasets = [datasets[0]] + +models = [ + *peiqi_models, + *baichuan2_chats_vllm, + *gemma_vllm, + *internlm2_chats_vllm, + *vicuna_bases_vllm, + *yi_all_vllm, +] + +for dataset in datasets: + dataset['sample_setting'] = dict() + dataset['infer_cfg']['inferencer']['save_every'] = 8 + dataset['infer_cfg']['inferencer']['sc_size'] = 1 + dataset['infer_cfg']['inferencer']['max_token_len'] = 200 + dataset['eval_cfg']['sc_size'] = 1 + # dataset['sample_setting'] = dict(sample_size=2) # !!!WARNING: Use for testing only!!! 
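+# This config appears to cover the first shard of the vllm_qa.py model pool
+# (GPT-3.5 API, Baichuan2, Gemma, InternLM2, Vicuna and Yi); mistral and the
+# Qwen1.5 chat models are picked up by vllm_qa_2.py.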
+ + +infer = dict( + partitioner=dict( + # type=SizePartitioner, + # max_task_size=100, + # gen_task_coef=1, + type=NaivePartitioner + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + max_workers_per_gpu=1, + task=dict(type=OpenICLInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + max_num_workers=32, + task=dict(type=OpenICLEvalTask)), +) diff --git a/configs/lyh/vllm_qa_2.py b/configs/lyh/vllm_qa_2.py new file mode 100644 index 0000000..2015c12 --- /dev/null +++ b/configs/lyh/vllm_qa_2.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +with read_base(): + # Datasets + from ..datasets.opseval.datasets import all_gen_qa + # Models + from ..local_models.baichuan.baichuan import baichuan2_chats_vllm + from ..local_models.google.gemma import gemma_vllm + from ..local_models.internlm.internlm import internlm2_chats_vllm + from ..local_models.lmsys.vicuna import vicuna_bases_vllm + from ..local_models.mistral.mistral import mistral_7b_vllm + from ..local_models.qwen.qwen import qwen1_5_chat_vllm_models + from ..local_models.yi.yi import yi_all_vllm + from ..models.gpt_3dot5_turbo_peiqi import models as peiqi_models + + from ..paths import ROOT_DIR + + +datasets = [ + *all_gen_qa +] + +# datasets = [datasets[0]] + +models = [ + mistral_7b_vllm, + #*yi_all_vllm, + *qwen1_5_chat_vllm_models, +] + +for dataset in datasets: + dataset['sample_setting'] = dict() + dataset['infer_cfg']['inferencer']['save_every'] = 8 + dataset['infer_cfg']['inferencer']['sc_size'] = 1 + dataset['infer_cfg']['inferencer']['max_token_len'] = 200 + dataset['eval_cfg']['sc_size'] = 1 + # dataset['sample_setting'] = dict(sample_size=2) # !!!WARNING: Use for testing only!!! 
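+# The infer/eval blocks below mirror vllm_qa.py: NaivePartitioner issues one
+# task per model/dataset pair, and LocalRunner keeps at most one worker per
+# GPU, which presumably suits vLLM engines that already occupy a full device.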
+ + +infer = dict( + partitioner=dict( + # type=SizePartitioner, + # max_task_size=100, + # gen_task_coef=1, + type=NaivePartitioner + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + max_workers_per_gpu=1, + task=dict(type=OpenICLInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + max_num_workers=32, + task=dict(type=OpenICLEvalTask)), +) diff --git a/configs/tests/test_model_datasets.py b/configs/tests/test_model_datasets.py new file mode 100644 index 0000000..61a839a --- /dev/null +++ b/configs/tests/test_model_datasets.py @@ -0,0 +1,69 @@ +from mmengine.config import read_base +from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +with read_base(): + # Datasets + from ..datasets.opseval.datasets import all_gen_qa + # Models + from ..local_models.baichuan.baichuan import baichuan2_chats_vllm + from ..local_models.google.gemma import gemma_vllm + from ..local_models.internlm.internlm import internlm2_chats_vllm + from ..local_models.lmsys.vicuna import vicuna_bases_vllm + from ..local_models.mistral.mistral import mistral_7b_vllm + from ..local_models.qwen.qwen import qwen1_5_chat_vllm_models + from ..local_models.yi.yi import yi_all_vllm + from ..models.gpt_3dot5_turbo_peiqi import models as peiqi_models + + from ..paths import ROOT_DIR + + +datasets = [ + *all_gen_qa +] + +# datasets = [datasets[0]] + +models = [ + *peiqi_models, + *baichuan2_chats_vllm, + *gemma_vllm, + *internlm2_chats_vllm, + *vicuna_bases_vllm, + mistral_7b_vllm, + *yi_all_vllm, + *qwen1_5_chat_vllm_models, +] + +for dataset in datasets: + dataset['sample_setting'] = dict() + dataset['infer_cfg']['inferencer']['save_every'] = 8 + dataset['infer_cfg']['inferencer']['sc_size'] = 1 + dataset['infer_cfg']['inferencer']['max_token_len'] = 200 + dataset['eval_cfg']['sc_size'] = 1 + # dataset['sample_setting'] = dict(sample_size=2) # !!!WARNING: Use for testing only!!! 
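+# Test config: same model pool and dataset overrides as configs/lyh/vllm_qa.py;
+# uncomment the sample_setting line above to smoke-test on a 2-sample subset.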
+ + +infer = dict( + partitioner=dict( + # type=SizePartitioner, + # max_task_size=100, + # gen_task_coef=1, + type=NaivePartitioner + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + max_workers_per_gpu=1, + task=dict(type=OpenICLInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + max_num_workers=32, + task=dict(type=OpenICLEvalTask)), +) diff --git a/opencompass/openicl/icl_evaluator/__init__.py b/opencompass/openicl/icl_evaluator/__init__.py index 5f87132..7ff0ca3 100644 --- a/opencompass/openicl/icl_evaluator/__init__.py +++ b/opencompass/openicl/icl_evaluator/__init__.py @@ -10,5 +10,5 @@ from .icl_plugin_evaluator import TEvalEvaluator # noqa from .icl_toxic_evaluator import ToxicEvaluator # noqa from .icl_stat_evaluator import * # noqa -from .opseval_gen_evaluator import OpsEvalGenMCEvaluator # noqa +from .opseval_gen_evaluator import OpsEvalGenMCEvaluator, OpsEvalGenQAEvaluator # noqa from .lm_evaluator import LMEvaluator # noqa diff --git a/opencompass/openicl/icl_evaluator/opseval_gen_evaluator.py b/opencompass/openicl/icl_evaluator/opseval_gen_evaluator.py index 22feb34..10df4b0 100644 --- a/opencompass/openicl/icl_evaluator/opseval_gen_evaluator.py +++ b/opencompass/openicl/icl_evaluator/opseval_gen_evaluator.py @@ -5,6 +5,9 @@ from .icl_base_evaluator import BaseEvaluator import re from collections import Counter +import jieba +from rouge import Rouge +from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction def extract_answer(text: str) -> str: """ @@ -115,4 +118,41 @@ def score(self, predictions: List, references: List) -> dict: return { 'Accuracy': correct / tot * 100, 'SC-Accuracy': sc_correct / tot * 100, - } \ No newline at end of file + } + +class OpsEvalGenQAEvaluator(BaseEvaluator): + + def __init__(self, language='en'): + super().__init__() + self.language = language + + def score(self, predictions: List, references: List) -> dict: + tot_bleu, tot_rouge = 0, 0 + for pred, ans in zip(predictions, references): + bleu_score, rouge_score = self.get_rouge_bleu(pred, ans, self.language) + tot_bleu += bleu_score + tot_rouge += rouge_score + return { + "bleu": tot_bleu / len(predictions), + "rouge": tot_rouge / len(predictions) + } + + def get_rouge_bleu(self, pred, answer, language='en'): + rouge = Rouge() + smoothie = SmoothingFunction().method7 + def clean_word(words): + punctuations = """!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~、。,·ˉˇ¨〃‘’“”々〆〇〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.""" + return [word for word in words if word.strip() and word not in punctuations] + try: + if language == 'en': + bleu_score = sentence_bleu([clean_word(answer.split())], clean_word(pred.split()), smoothing_function=smoothie) + rouge_score = rouge.get_scores(' '.join(clean_word(pred.split())), ' '.join(clean_word(answer.split())), avg=True)['rouge-l']['f'] + else: + answer_tokenized = clean_word(list(jieba.cut(answer))) + pred_tokenized = clean_word(list(jieba.cut(pred))) + bleu_score = sentence_bleu([answer_tokenized], pred_tokenized, smoothing_function=smoothie) + rouge_score = rouge.get_scores(' '.join(pred_tokenized), ' '.join(answer_tokenized), avg=True)['rouge-l']['f'] + except Exception as err: + print(f"[WARNING] Error when calculating bleu and rouge: {err}") + bleu_score, rouge_score = 0.0, 0.0 + return bleu_score, rouge_score \ No newline at end of file diff --git a/opencompass/openicl/icl_retriever/icl_base_retriever.py b/opencompass/openicl/icl_retriever/icl_base_retriever.py index 2351c7e..d5b5bcd 
100644 --- a/opencompass/openicl/icl_retriever/icl_base_retriever.py +++ b/opencompass/openicl/icl_retriever/icl_base_retriever.py @@ -164,7 +164,8 @@ def get_label_by_idx(self, idx): self.dataset_reader.output_column: self.test_ds[self.dataset_reader.output_column][idx], 'id': - self.test_ds['id'][idx] if 'id' in self.test_ds.features else None + self.test_ds['id'][idx] if 'id' in self.test_ds.features else None, + 'question': self.test_ds['question'][idx] if 'question' in self.test_ds.features else None, } def generate_prompt_for_ppl_task(