diff --git a/configs/datasets/opseval/datasets.py b/configs/datasets/opseval/datasets.py index e6895e4..89a9b12 100644 --- a/configs/datasets/opseval/datasets.py +++ b/configs/datasets/opseval/datasets.py @@ -63,15 +63,22 @@ def get_selected_datasets(setting_func, name, path, langs, qtypes): network_mc = network_mc_ppl + network_mc_gen company_mc_list = [ - ('bosc', f'{ROOT_DIR}data/opseval/bosc/splitted', ['zh'], ['single']), + ('bosc', f'{ROOT_DIR}data/opseval/bosc/splitted_v2', ['zh'], ['single', 'multiple']), # ('dfcdata', f'{ROOT_DIR}data/opseval/dfcdata/splitted', ['zh'], ['single']), - ('gtja', f'{ROOT_DIR}data/opseval/gtja/splitted', ['zh'], ['single']), + ('gtja', f'{ROOT_DIR}data/opseval/gtja/splitted_v2', ['zh'], ['single', 'multiple']), ('huaweicloud', f'{ROOT_DIR}data/opseval/huaweicloud/splitted', ['zh'], ['single', 'multiple']), ('lenovo', f'{ROOT_DIR}data/opseval/lenovo/splitted', ['zh'], ['single', 'multiple']), - ('pufa', f'{ROOT_DIR}data/opseval/pufa/splitted', ['zh'], ['single']), - ('rzy', f'{ROOT_DIR}data/opseval/rzy/splitted', ['zh'], ['single']), - ('zabbix', f'{ROOT_DIR}data/opseval/zabbix/splitted', ['zh'], ['single']), - ('zjyd', f'{ROOT_DIR}data/opseval/zjyd/', ['zh'], ['single']), + ('pufa', f'{ROOT_DIR}data/opseval/pufa/splitted_v2', ['zh'], ['single', 'multiple']), + ('rzy', f'{ROOT_DIR}data/opseval/rzy/splitted_v2', ['zh'], ['single', 'multiple']), + ('zabbix', f'{ROOT_DIR}data/opseval/zabbix/splitted_v2', ['zh'], ['single', 'multiple']), + ('zjyd', f'{ROOT_DIR}data/opseval/zjyd/splitted_v2', ['zh'], ['single', 'multiple']), +] + +company_qa_list = [ + ('bosc', f'{ROOT_DIR}data/opseval/bosc/splitted_v2', ['zh'], None), + ('rzy', f'{ROOT_DIR}data/opseval/rzy/splitted_v2', ['zh'], None), + ('zabbix', f'{ROOT_DIR}data/opseval/zabbix/splitted_v2', ['zh'], None), + ('zjyd', f'{ROOT_DIR}data/opseval/zjyd/splitted_v2', ['zh'], None), ] company_mc_ppl = sum([ @@ -82,16 +89,24 @@ def get_selected_datasets(setting_func, name, path, langs, qtypes): ], []) company_mc = company_mc_ppl+company_mc_gen +company_qa_ppl = sum([ + get_selected_datasets([get_qa_ppl_datasets], name, path, langs, qtypes) for name, path, langs, qtypes in company_qa_list +], []) +company_qa_gen = sum([ + get_selected_datasets([get_qa_gen_datasets], name, path, langs, qtypes) for name, path, langs, qtypes in company_qa_list +], []) +company_qa = company_qa_ppl+company_qa_gen + -rzy_qa_ppl = get_selected_datasets([get_qa_ppl_datasets], 'rzy', f'{ROOT_DIR}data/opseval/rzy/splitted', langs=['zh'], qtypes=None) -rzy_qa_gen = get_selected_datasets([get_qa_gen_datasets], 'rzy', f'{ROOT_DIR}data/opseval/rzy/splitted', langs=['zh'], qtypes=None) +rzy_qa_ppl = get_selected_datasets([get_qa_ppl_datasets], 'rzy', f'{ROOT_DIR}data/opseval/rzy/splitted_v2', langs=['zh'], qtypes=None) +rzy_qa_gen = get_selected_datasets([get_qa_gen_datasets], 'rzy', f'{ROOT_DIR}data/opseval/rzy/splitted_v2', langs=['zh'], qtypes=None) rzy_qa = rzy_qa_ppl + rzy_qa_gen all_ppl_mc = zte_mc_ppl + oracle_mc_ppl + owl_mc_ppl + network_mc_ppl + company_mc_ppl all_gen_mc = zte_mc_gen + oracle_mc_gen + owl_mc_gen + network_mc_gen + company_mc_gen -all_ppl_qa = owl_qa_ppl + rzy_qa_ppl -all_gen_qa = owl_qa_gen + rzy_qa_gen +all_ppl_qa = owl_qa_ppl + company_qa_ppl +all_gen_qa = owl_qa_gen + company_qa_gen ceval_mc_ppl = get_selected_datasets([get_mc_ppl_datasets], 'ceval', f'{ROOT_DIR}data/opseval/ceval/splitted_dev', langs=['zh'], qtypes=['single']) ceval_mc_gen = get_selected_datasets([get_mc_gen_datasets], 'ceval', f'{ROOT_DIR}data/opseval/ceval/splitted_dev', langs=['zh'], qtypes=['single']) diff --git a/configs/datasets/opseval/qa_gen.py b/configs/datasets/opseval/qa_gen.py index 36d03a7..d3edc01 100644 --- a/configs/datasets/opseval/qa_gen.py +++ b/configs/datasets/opseval/qa_gen.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import FixKRetriever, ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer, SCInferencer, CoTInferencer -from opencompass.openicl.icl_evaluator import AccEvaluator, BleuRougeEvaluator +from opencompass.openicl.icl_evaluator import AccEvaluator, BleuRougeEvaluator, OpsEvalGenQAEvaluator from opencompass.utils.text_postprocessors import first_capital_postprocess_multi from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer @@ -34,7 +34,8 @@ def get_qa_gen_datasets(dataset_name, path, langs=['zh'], qtypes=None): retriever=retriever_dict, inferencer=get_gen_inferencer(sc=False), ), - eval_cfg=dict(evaluator=dict(type=BleuRougeEvaluator)) + # eval_cfg=dict(evaluator=dict(type=BleuRougeEvaluator)) + eval_cfg=dict(evaluator=dict(type=OpsEvalGenQAEvaluator, language=lang)) ) for shot_abbr, shot_hint_id, retriever_dict in zip( ['Zero-shot', '3-shot'], diff --git a/configs/local_models/baichuan/baichuan.py b/configs/local_models/baichuan/baichuan.py index a474d05..e07e2a1 100644 --- a/configs/local_models/baichuan/baichuan.py +++ b/configs/local_models/baichuan/baichuan.py @@ -3,7 +3,7 @@ from mmengine import read_base with read_base(): from ...paths import ROOT_DIR - from ..model_template import get_default_model + from ..model_template import get_default_model, get_vllm_model from ...api_key import baichuan_key _meta_template = dict( @@ -23,6 +23,16 @@ baichuan2_chats = [baichuan2_7b_chat, baichuan2_13b_chat] +baichuan2_7b_base_vllm = get_vllm_model(abbr="baichuan2-7b", path=f"{ROOT_DIR}models/baichuan-inc/Baichuan2-7B-Base") +baichuan2_7b_chat_vllm = get_vllm_model(abbr="baichuan2-7b-chat", path=f"{ROOT_DIR}models/baichuan-inc/Baichuan2-7B-Chat", meta_template=_meta_template) + +baichuan2_13b_base_vllm = get_vllm_model(abbr="baichuan2-13b", path=f"{ROOT_DIR}models/baichuan-inc/Baichuan2-13B-Base") +baichuan2_13b_chat_vllm = get_vllm_model(abbr="baichuan2-13b-chat", path=f"{ROOT_DIR}models/baichuan-inc/Baichuan2-13B-Chat", meta_template=_meta_template, num_gpus=1) + +baichuan2_bases_vllm = [baichuan2_7b_base_vllm, baichuan2_13b_base_vllm] +baichuan2_chats_vllm = [baichuan2_7b_chat_vllm, baichuan2_13b_chat_vllm] + + baichuan2_turbo = dict(abbr='Baichuan2-Turbo', type=BaichuanAPI, path='Baichuan2-Turbo', key=baichuan_key, max_out_len=100, max_seq_len=2048, batch_size=1) diff --git a/configs/local_models/google/gemma.py b/configs/local_models/google/gemma.py index dfc950b..d6c2dfc 100644 --- a/configs/local_models/google/gemma.py +++ b/configs/local_models/google/gemma.py @@ -3,6 +3,7 @@ from mmengine.config import read_base with read_base(): from ...paths import ROOT_DIR + from ..model_template import get_default_model, get_vllm_model gemma_2b = dict( type=HuggingFaceCausalLM, @@ -28,4 +29,9 @@ batch_size=8, model_kwargs=dict(device_map='auto', trust_remote_code=True), run_cfg=dict(num_gpus=1, num_procs=1), - ) \ No newline at end of file + ) + +gemma_2b_vllm = get_vllm_model(abbr="gemma-2b", path=ROOT_DIR+"models/google/gemma-2b") +gemma_7b_vllm = get_vllm_model(abbr="gemma-7b", path=ROOT_DIR+"models/google/gemma-7b") + +gemma_vllm = [gemma_2b_vllm, gemma_7b_vllm] \ No newline at end of file diff --git a/configs/local_models/internlm/internlm.py b/configs/local_models/internlm/internlm.py index 072395e..6764489 100644 --- a/configs/local_models/internlm/internlm.py +++ b/configs/local_models/internlm/internlm.py @@ -3,7 +3,7 @@ from mmengine.config import read_base with read_base(): from ...paths import ROOT_DIR - from ..model_template import get_default_model + from ..model_template import get_default_model, get_vllm_model internlm2_chat_7b = dict( type=HuggingFaceCausalLM, @@ -35,4 +35,13 @@ internlm2_20b = get_default_model(abbr="internlm2-20b", path=ROOT_DIR+"models/Shanghai_AI_Laboratory/internlm2-20b", num_gpus=2) internlm2_bases = [internlm2_7b, internlm2_20b] -internlm2_chats = [internlm2_chat_20b, internlm2_chat_7b] \ No newline at end of file +internlm2_chats = [internlm2_chat_20b, internlm2_chat_7b] + +internlm2_7b_vllm = get_vllm_model(abbr="internlm2-7b", path=ROOT_DIR+"models/Shanghai_AI_Laboratory/internlm2-7b") +internlm2_20b_vllm = get_vllm_model(abbr="internlm2-20b", path=ROOT_DIR+"models/Shanghai_AI_Laboratory/internlm2-20b", num_gpus=4) + +internlm2_chat_7b_vllm = get_vllm_model(abbr="internlm2-chat-7b", path=ROOT_DIR+"models/Shanghai_AI_Laboratory/internlm2-chat-7b") +internlm2_chat_20b_vllm = get_vllm_model(abbr="internlm2-chat-20b", path=ROOT_DIR+"models/Shanghai_AI_Laboratory/internlm2-chat-20b", num_gpus=4) + +internlm2_bases_vllm = [internlm2_7b_vllm, internlm2_20b_vllm] +internlm2_chats_vllm = [internlm2_chat_20b_vllm, internlm2_chat_7b_vllm] \ No newline at end of file diff --git a/configs/local_models/lmsys/vicuna.py b/configs/local_models/lmsys/vicuna.py index 3818996..a8a7fc2 100644 --- a/configs/local_models/lmsys/vicuna.py +++ b/configs/local_models/lmsys/vicuna.py @@ -1,10 +1,15 @@ from mmengine.config import read_base with read_base(): - from ..model_template import get_default_model + from ..model_template import get_default_model, get_vllm_model from ...paths import ROOT_DIR vicuna_7b_v1_5 = get_default_model(abbr="vicuna-7b-v1.5", path=f"{ROOT_DIR}models/lmsys/vicuna-7b-v1.5") vicuna_13b_v1_5 = get_default_model(abbr="vicuna-13b-v1.5", path=f"{ROOT_DIR}models/lmsys/vicuna-13b-v1.5") -vicuna_bases = [vicuna_7b_v1_5, vicuna_13b_v1_5] \ No newline at end of file +vicuna_bases = [vicuna_7b_v1_5, vicuna_13b_v1_5] + +vicuna_7b_v1_5_vllm = get_vllm_model(abbr="vicuna-7b-v1.5", path=f"{ROOT_DIR}models/lmsys/vicuna-7b-v1.5") +vicuna_13b_v1_5_vllm = get_vllm_model(abbr="vicuna-13b-v1.5", path=f"{ROOT_DIR}models/lmsys/vicuna-13b-v1.5") + +vicuna_bases_vllm = [vicuna_7b_v1_5_vllm, vicuna_13b_v1_5_vllm] \ No newline at end of file diff --git a/configs/local_models/mistral/mistral.py b/configs/local_models/mistral/mistral.py index 684daf0..0606400 100644 --- a/configs/local_models/mistral/mistral.py +++ b/configs/local_models/mistral/mistral.py @@ -1,6 +1,8 @@ from mmengine.config import read_base with read_base(): - from ..model_template import get_default_model + from ..model_template import get_default_model, get_vllm_model from ...paths import ROOT_DIR -mistral_7b = get_default_model(abbr="mistral-7b", path=f"{ROOT_DIR}models/mistralai/Mistral-7B-v0.1") \ No newline at end of file +mistral_7b = get_default_model(abbr="mistral-7b", path=f"{ROOT_DIR}models/mistralai/Mistral-7B-v0.1") + +mistral_7b_vllm = get_vllm_model(abbr="mistral-7b", path=f"{ROOT_DIR}models/mistralai/Mistral-7B-v0.1") \ No newline at end of file diff --git a/configs/local_models/model_template.py b/configs/local_models/model_template.py index 82c7598..3c214b7 100644 --- a/configs/local_models/model_template.py +++ b/configs/local_models/model_template.py @@ -1,4 +1,5 @@ from opencompass.models.huggingface import HuggingFaceCausalLM +from opencompass.models import VLLM def get_default_model(abbr, path, num_gpus=1, meta_template=None): @@ -17,4 +18,21 @@ def get_default_model(abbr, path, num_gpus=1, meta_template=None): batch_size=8, model_kwargs=dict(device_map='auto', trust_remote_code=True), run_cfg=dict(num_gpus=num_gpus, num_procs=1), + ) + +def get_vllm_model(abbr, path, num_gpus=1, meta_template=None, generation_kwargs=dict()): + return dict( + type=VLLM, + abbr=abbr, + path=path, + max_seq_len=2048, + model_kwargs=dict(trust_remote_code=True, max_model_len=2000, tensor_parallel_size=num_gpus, gpu_memory_utilization=0.95), + generation_kwargs=generation_kwargs, + meta_template=meta_template, + max_out_len=400, + mode='none', + batch_size=1, + use_fastchat_template=False, + run_cfg=dict(num_gpus=num_gpus, num_procs=1), + end_str=None, ) \ No newline at end of file diff --git a/configs/local_models/qwen/qwen.py b/configs/local_models/qwen/qwen.py index a64a7db..2b364d3 100644 --- a/configs/local_models/qwen/qwen.py +++ b/configs/local_models/qwen/qwen.py @@ -2,7 +2,7 @@ from mmengine.config import read_base with read_base(): from ...paths import ROOT_DIR - from ..model_template import get_default_model + from ..model_template import get_default_model, get_vllm_model qwen_meta_template = dict( round=[ @@ -126,4 +126,21 @@ batch_size=8, model_kwargs=dict(device_map='auto', trust_remote_code=True), run_cfg=dict(num_gpus=1, num_procs=1), - ) \ No newline at end of file + ) + +qwen1_5_base_vllm_models = [ + get_vllm_model( + abbr=f"qwen1.5-{quant}b-base", + path=f"{ROOT_DIR}models/Qwen/Qwen1.5-{quant}B", + num_gpus=1 if "72" not in quant else 4) + for quant in ["0.5", "1.8", "4", "7", "14", "72"] +] + +qwen1_5_chat_vllm_models = [ + get_vllm_model( + abbr=f"qwen1.5-{quant}b-chat", + meta_template=qwen_meta_template, + num_gpus=1 if "72" not in quant else 4, + path=f"{ROOT_DIR}models/Qwen/Qwen1.5-{quant}B-Chat") + for quant in ["0.5", "1.8", "4", "7", "14", "72"] +] \ No newline at end of file diff --git a/configs/local_models/qwen/qwen_vllm.py b/configs/local_models/qwen/qwen_vllm.py deleted file mode 100644 index 950be76..0000000 --- a/configs/local_models/qwen/qwen_vllm.py +++ /dev/null @@ -1,15 +0,0 @@ -from opencompass.models import VLLM - -qwen_1_5b_14b_chat_vllm = dict( - type=VLLM, - abbr='qwen-1.5b-14b-chat', - path="/home/junetheriver/models/qwen/Qwen1.5-14B-Chat", - max_seq_len=2048, - model_kwargs=dict(trust_remote_code=True, max_model_len=2048), - generation_kwargs=dict(), - meta_template=None, - mode='none', - batch_size=1, - use_fastchat_template=False, - end_str=None, -) \ No newline at end of file diff --git a/configs/local_models/yi/yi.py b/configs/local_models/yi/yi.py index 8a36f4e..c2094f8 100644 --- a/configs/local_models/yi/yi.py +++ b/configs/local_models/yi/yi.py @@ -1,17 +1,42 @@ from mmengine.config import read_base with read_base(): from ...paths import ROOT_DIR - from ..model_template import get_default_model + from ..model_template import get_default_model, get_vllm_model +yi_meta_template = dict( + round=[ + dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>'), + dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>', generate=True), + ], +) yi_6b = get_default_model(abbr="yi-6b", path=f"{ROOT_DIR}models/01-ai/Yi-6B") yi_9b = get_default_model(abbr="yi-9b", path=f"{ROOT_DIR}models/01-ai/Yi-9B") -yi_34b = get_default_model(abbr="yi-34b", path=f"{ROOT_DIR}models/01-ai/Yi-34B") +yi_34b = get_default_model(abbr="yi-34b", path=f"{ROOT_DIR}models/01-ai/Yi-34B", num_gpus=2) -yi_6b_chat = get_default_model(abbr="yi-6b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-6B-Chat") -yi_9b_chat = get_default_model(abbr="yi-9b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-9B-Chat") -yi_34b_chat = get_default_model(abbr="yi-34b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-34B-Chat") +yi_6b_chat = get_default_model(abbr="yi-6b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-6B-Chat", meta_template=yi_meta_template) +# yi_9b_chat = get_default_model(abbr="yi-9b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-9B-Chat", meta_template=yi_meta_template) +yi_34b_chat = get_default_model(abbr="yi-34b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-34B-Chat", meta_template=yi_meta_template, num_gpus=4) yi_bases = [yi_6b, yi_9b, yi_34b] -yi_chats = [yi_6b_chat, yi_9b_chat, yi_34b_chat] -yi_all = yi_bases + yi_chats \ No newline at end of file +yi_chats = [yi_6b_chat, + # yi_9b_chat, + yi_34b_chat] +yi_all = yi_bases + yi_chats + + + + +yi_6b_vllm = get_vllm_model(abbr="yi-6b", path=f"{ROOT_DIR}models/01-ai/Yi-6B") +yi_9b_vllm = get_vllm_model(abbr="yi-9b", path=f"{ROOT_DIR}models/01-ai/Yi-9B") +yi_34b_vllm = get_vllm_model(abbr="yi-34b", path=f"{ROOT_DIR}models/01-ai/Yi-34B", num_gpus=2) + +yi_6b_chat_vllm = get_vllm_model(abbr="yi-6b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-6B-Chat", meta_template=yi_meta_template) +# yi_9b_chat_vllm = get_vllm_model(abbr="yi-9b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-9B-Chat", meta_template=yi_meta_template) +yi_34b_chat_vllm = get_vllm_model(abbr="yi-34b-chat", path=f"{ROOT_DIR}models/01-ai/Yi-34B-Chat", meta_template=yi_meta_template, num_gpus=4) + +yi_bases_vllm = [yi_6b_vllm, yi_9b_vllm, yi_34b_vllm] +yi_chats_vllm = [yi_6b_chat_vllm, + # yi_9b_chat_vllm, + yi_34b_chat_vllm] +yi_all_vllm = yi_bases_vllm + yi_chats_vllm \ No newline at end of file diff --git a/configs/lyh/vllm_qa.py b/configs/lyh/vllm_qa.py new file mode 100644 index 0000000..f329e3e --- /dev/null +++ b/configs/lyh/vllm_qa.py @@ -0,0 +1,69 @@ +from mmengine.config import read_base +from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +with read_base(): + # Datasets + from ..datasets.opseval.datasets import all_gen_qa + # Models + from ..local_models.baichuan.baichuan import baichuan2_chats_vllm + from ..local_models.google.gemma import gemma_vllm + from ..local_models.internlm.internlm import internlm2_chats_vllm + from ..local_models.lmsys.vicuna import vicuna_bases_vllm + from ..local_models.mistral.mistral import mistral_7b_vllm + from ..local_models.qwen.qwen import qwen1_5_chat_vllm_models + from ..local_models.yi.yi import yi_all_vllm + from ..models.gpt_3dot5_turbo_peiqi import models as peiqi_models + + from ..paths import ROOT_DIR + + +datasets = [ + *all_gen_qa +] + +# datasets = [datasets[0]] + +models = [ + *peiqi_models, + *baichuan2_chats_vllm, + *gemma_vllm, + *internlm2_chats_vllm, + *vicuna_bases_vllm, + *yi_all_vllm, + mistral_7b_vllm, + *qwen1_5_chat_vllm_models, +] + +for dataset in datasets: + dataset['sample_setting'] = dict() + dataset['infer_cfg']['inferencer']['save_every'] = 8 + dataset['infer_cfg']['inferencer']['sc_size'] = 1 + dataset['infer_cfg']['inferencer']['max_token_len'] = 200 + dataset['eval_cfg']['sc_size'] = 1 + # dataset['sample_setting'] = dict(sample_size=2) # !!!WARNING: Use for testing only!!! + + +infer = dict( + partitioner=dict( + # type=SizePartitioner, + # max_task_size=100, + # gen_task_coef=1, + type=NaivePartitioner + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + max_workers_per_gpu=1, + task=dict(type=OpenICLInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + max_num_workers=32, + task=dict(type=OpenICLEvalTask)), +) diff --git a/configs/lyh/vllm_qa_1.py b/configs/lyh/vllm_qa_1.py new file mode 100644 index 0000000..695c5c8 --- /dev/null +++ b/configs/lyh/vllm_qa_1.py @@ -0,0 +1,67 @@ +from mmengine.config import read_base +from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +with read_base(): + # Datasets + from ..datasets.opseval.datasets import all_gen_qa + # Models + from ..local_models.baichuan.baichuan import baichuan2_chats_vllm + from ..local_models.google.gemma import gemma_vllm + from ..local_models.internlm.internlm import internlm2_chats_vllm + from ..local_models.lmsys.vicuna import vicuna_bases_vllm + from ..local_models.mistral.mistral import mistral_7b_vllm + from ..local_models.qwen.qwen import qwen1_5_chat_vllm_models + from ..local_models.yi.yi import yi_all_vllm + from ..models.gpt_3dot5_turbo_peiqi import models as peiqi_models + + from ..paths import ROOT_DIR + + +datasets = [ + *all_gen_qa +] + +# datasets = [datasets[0]] + +models = [ + *peiqi_models, + *baichuan2_chats_vllm, + *gemma_vllm, + *internlm2_chats_vllm, + *vicuna_bases_vllm, + *yi_all_vllm, +] + +for dataset in datasets: + dataset['sample_setting'] = dict() + dataset['infer_cfg']['inferencer']['save_every'] = 8 + dataset['infer_cfg']['inferencer']['sc_size'] = 1 + dataset['infer_cfg']['inferencer']['max_token_len'] = 200 + dataset['eval_cfg']['sc_size'] = 1 + # dataset['sample_setting'] = dict(sample_size=2) # !!!WARNING: Use for testing only!!! + + +infer = dict( + partitioner=dict( + # type=SizePartitioner, + # max_task_size=100, + # gen_task_coef=1, + type=NaivePartitioner + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + max_workers_per_gpu=1, + task=dict(type=OpenICLInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + max_num_workers=32, + task=dict(type=OpenICLEvalTask)), +) diff --git a/configs/lyh/vllm_qa_2.py b/configs/lyh/vllm_qa_2.py new file mode 100644 index 0000000..2015c12 --- /dev/null +++ b/configs/lyh/vllm_qa_2.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +with read_base(): + # Datasets + from ..datasets.opseval.datasets import all_gen_qa + # Models + from ..local_models.baichuan.baichuan import baichuan2_chats_vllm + from ..local_models.google.gemma import gemma_vllm + from ..local_models.internlm.internlm import internlm2_chats_vllm + from ..local_models.lmsys.vicuna import vicuna_bases_vllm + from ..local_models.mistral.mistral import mistral_7b_vllm + from ..local_models.qwen.qwen import qwen1_5_chat_vllm_models + from ..local_models.yi.yi import yi_all_vllm + from ..models.gpt_3dot5_turbo_peiqi import models as peiqi_models + + from ..paths import ROOT_DIR + + +datasets = [ + *all_gen_qa +] + +# datasets = [datasets[0]] + +models = [ + mistral_7b_vllm, + #*yi_all_vllm, + *qwen1_5_chat_vllm_models, +] + +for dataset in datasets: + dataset['sample_setting'] = dict() + dataset['infer_cfg']['inferencer']['save_every'] = 8 + dataset['infer_cfg']['inferencer']['sc_size'] = 1 + dataset['infer_cfg']['inferencer']['max_token_len'] = 200 + dataset['eval_cfg']['sc_size'] = 1 + # dataset['sample_setting'] = dict(sample_size=2) # !!!WARNING: Use for testing only!!! + + +infer = dict( + partitioner=dict( + # type=SizePartitioner, + # max_task_size=100, + # gen_task_coef=1, + type=NaivePartitioner + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + max_workers_per_gpu=1, + task=dict(type=OpenICLInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + max_num_workers=32, + task=dict(type=OpenICLEvalTask)), +) diff --git a/configs/tests/test_model_datasets.py b/configs/tests/test_model_datasets.py new file mode 100644 index 0000000..61a839a --- /dev/null +++ b/configs/tests/test_model_datasets.py @@ -0,0 +1,69 @@ +from mmengine.config import read_base +from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +with read_base(): + # Datasets + from ..datasets.opseval.datasets import all_gen_qa + # Models + from ..local_models.baichuan.baichuan import baichuan2_chats_vllm + from ..local_models.google.gemma import gemma_vllm + from ..local_models.internlm.internlm import internlm2_chats_vllm + from ..local_models.lmsys.vicuna import vicuna_bases_vllm + from ..local_models.mistral.mistral import mistral_7b_vllm + from ..local_models.qwen.qwen import qwen1_5_chat_vllm_models + from ..local_models.yi.yi import yi_all_vllm + from ..models.gpt_3dot5_turbo_peiqi import models as peiqi_models + + from ..paths import ROOT_DIR + + +datasets = [ + *all_gen_qa +] + +# datasets = [datasets[0]] + +models = [ + *peiqi_models, + *baichuan2_chats_vllm, + *gemma_vllm, + *internlm2_chats_vllm, + *vicuna_bases_vllm, + mistral_7b_vllm, + *yi_all_vllm, + *qwen1_5_chat_vllm_models, +] + +for dataset in datasets: + dataset['sample_setting'] = dict() + dataset['infer_cfg']['inferencer']['save_every'] = 8 + dataset['infer_cfg']['inferencer']['sc_size'] = 1 + dataset['infer_cfg']['inferencer']['max_token_len'] = 200 + dataset['eval_cfg']['sc_size'] = 1 + # dataset['sample_setting'] = dict(sample_size=2) # !!!WARNING: Use for testing only!!! + + +infer = dict( + partitioner=dict( + # type=SizePartitioner, + # max_task_size=100, + # gen_task_coef=1, + type=NaivePartitioner + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + max_workers_per_gpu=1, + task=dict(type=OpenICLInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + max_num_workers=32, + task=dict(type=OpenICLEvalTask)), +) diff --git a/opencompass/openicl/icl_evaluator/__init__.py b/opencompass/openicl/icl_evaluator/__init__.py index 5f87132..7ff0ca3 100644 --- a/opencompass/openicl/icl_evaluator/__init__.py +++ b/opencompass/openicl/icl_evaluator/__init__.py @@ -10,5 +10,5 @@ from .icl_plugin_evaluator import TEvalEvaluator # noqa from .icl_toxic_evaluator import ToxicEvaluator # noqa from .icl_stat_evaluator import * # noqa -from .opseval_gen_evaluator import OpsEvalGenMCEvaluator # noqa +from .opseval_gen_evaluator import OpsEvalGenMCEvaluator, OpsEvalGenQAEvaluator # noqa from .lm_evaluator import LMEvaluator # noqa diff --git a/opencompass/openicl/icl_evaluator/opseval_gen_evaluator.py b/opencompass/openicl/icl_evaluator/opseval_gen_evaluator.py index 22feb34..10df4b0 100644 --- a/opencompass/openicl/icl_evaluator/opseval_gen_evaluator.py +++ b/opencompass/openicl/icl_evaluator/opseval_gen_evaluator.py @@ -5,6 +5,9 @@ from .icl_base_evaluator import BaseEvaluator import re from collections import Counter +import jieba +from rouge import Rouge +from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction def extract_answer(text: str) -> str: """ @@ -115,4 +118,41 @@ def score(self, predictions: List, references: List) -> dict: return { 'Accuracy': correct / tot * 100, 'SC-Accuracy': sc_correct / tot * 100, - } \ No newline at end of file + } + +class OpsEvalGenQAEvaluator(BaseEvaluator): + + def __init__(self, language='en'): + super().__init__() + self.language = language + + def score(self, predictions: List, references: List) -> dict: + tot_bleu, tot_rouge = 0, 0 + for pred, ans in zip(predictions, references): + bleu_score, rouge_score = self.get_rouge_bleu(pred, ans, self.language) + tot_bleu += bleu_score + tot_rouge += rouge_score + return { + "bleu": tot_bleu / len(predictions), + "rouge": tot_rouge / len(predictions) + } + + def get_rouge_bleu(self, pred, answer, language='en'): + rouge = Rouge() + smoothie = SmoothingFunction().method7 + def clean_word(words): + punctuations = """!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~、。,·ˉˇ¨〃‘’“”々〆〇〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.""" + return [word for word in words if word.strip() and word not in punctuations] + try: + if language == 'en': + bleu_score = sentence_bleu([clean_word(answer.split())], clean_word(pred.split()), smoothing_function=smoothie) + rouge_score = rouge.get_scores(' '.join(clean_word(pred.split())), ' '.join(clean_word(answer.split())), avg=True)['rouge-l']['f'] + else: + answer_tokenized = clean_word(list(jieba.cut(answer))) + pred_tokenized = clean_word(list(jieba.cut(pred))) + bleu_score = sentence_bleu([answer_tokenized], pred_tokenized, smoothing_function=smoothie) + rouge_score = rouge.get_scores(' '.join(pred_tokenized), ' '.join(answer_tokenized), avg=True)['rouge-l']['f'] + except Exception as err: + print(f"[WARNING] Error when calculating bleu and rouge: {err}") + bleu_score, rouge_score = 0.0, 0.0 + return bleu_score, rouge_score \ No newline at end of file diff --git a/opencompass/openicl/icl_retriever/icl_base_retriever.py b/opencompass/openicl/icl_retriever/icl_base_retriever.py index 2351c7e..d5b5bcd 100644 --- a/opencompass/openicl/icl_retriever/icl_base_retriever.py +++ b/opencompass/openicl/icl_retriever/icl_base_retriever.py @@ -164,7 +164,8 @@ def get_label_by_idx(self, idx): self.dataset_reader.output_column: self.test_ds[self.dataset_reader.output_column][idx], 'id': - self.test_ds['id'][idx] if 'id' in self.test_ds.features else None + self.test_ds['id'][idx] if 'id' in self.test_ds.features else None, + 'question': self.test_ds['question'][idx] if 'question' in self.test_ds.features else None, } def generate_prompt_for_ppl_task(