Skip to content

Commit

Permalink
zte ragas and longchat dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
NickLennonLiu committed Apr 15, 2024
1 parent 906e3bb commit ec877a5
Show file tree
Hide file tree
Showing 13 changed files with 323 additions and 43 deletions.
13 changes: 9 additions & 4 deletions configs/datasets/opseval/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
with read_base():
from .mc_gen import get_mc_gen_datasets
from .mc_ppl import get_mc_ppl_datasets
from .qa_gen import get_qa_gen_datasets
from .qa_ppl import get_qa_ppl_datasets
from .qa_gen import get_qa_gen_datasets, get_qa_long_gen_datasets
from .qa_ppl import get_qa_ppl_datasets, get_qa_long_ppl_datasets
from ...paths import ROOT_DIR

def get_all_datasets(name, path, langs, qtypes):
Expand Down Expand Up @@ -102,11 +102,16 @@ def get_selected_datasets(setting_func, name, path, langs, qtypes):
rzy_qa_gen = get_selected_datasets([get_qa_gen_datasets], 'rzy', f'{ROOT_DIR}data/opseval/rzy/splitted_v2', langs=['zh'], qtypes=None)
rzy_qa = rzy_qa_ppl + rzy_qa_gen

zedx_qa_ppl = get_selected_datasets([get_qa_long_ppl_datasets], 'zedx', f'{ROOT_DIR}data/opseval/zedx', langs=['zh'], qtypes=None)
zedx_qa_gen = get_selected_datasets([get_qa_long_gen_datasets], 'zedx', f'{ROOT_DIR}data/opseval/zedx', langs=['zh'], qtypes=None)
zedx_qa = zedx_qa_ppl + zedx_qa_gen

all_ppl_mc = zte_mc_ppl + oracle_mc_ppl + owl_mc_ppl + network_mc_ppl + company_mc_ppl
all_gen_mc = zte_mc_gen + oracle_mc_gen + owl_mc_gen + network_mc_gen + company_mc_gen

all_ppl_qa = owl_qa_ppl + company_qa_ppl
all_gen_qa = owl_qa_gen + company_qa_gen
all_ppl_qa = owl_qa_ppl + company_qa_ppl + zedx_qa_ppl
all_gen_qa = owl_qa_gen + company_qa_gen + zedx_qa_gen
all_qa = all_ppl_qa + all_gen_qa

ceval_mc_ppl = get_selected_datasets([get_mc_ppl_datasets], 'ceval', f'{ROOT_DIR}data/opseval/ceval/splitted_dev', langs=['zh'], qtypes=['single'])
ceval_mc_gen = get_selected_datasets([get_mc_gen_datasets], 'ceval', f'{ROOT_DIR}data/opseval/ceval/splitted_dev', langs=['zh'], qtypes=['single'])
Expand Down
42 changes: 41 additions & 1 deletion configs/datasets/opseval/qa_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def get_qa_gen_datasets(dataset_name, path, langs=['zh'], qtypes=None):
inferencer=get_gen_inferencer(sc=False),
),
# eval_cfg=dict(evaluator=dict(type=BleuRougeEvaluator))
eval_cfg=dict(evaluator=dict(type=OpsEvalGenQAEvaluator, language=lang))
eval_cfg=dict(evaluator=dict(type=OpsEvalGenQAEvaluator, language=lang), need_ragas=True)
)
for shot_abbr, shot_hint_id, retriever_dict in zip(
['Zero-shot', '3-shot'],
Expand All @@ -60,5 +60,45 @@ def get_qa_gen_datasets(dataset_name, path, langs=['zh'], qtypes=None):
selected.extend([d for d in datasets if f'{lang}' in d['abbr'].replace('_', '-').split('-')])
return selected

def get_qa_long_gen_datasets(dataset_name, path, langs=['zh'], qtypes=None):
    """Build generative long-form QA dataset configs for *dataset_name*.

    One config is produced per (shot setting, language) pair; the result is
    then filtered down to the languages listed in *langs*.  *qtypes* is kept
    for signature parity with the other dataset factories but is unused here.
    """
    # (abbr tag, few-shot hint id, retriever config) per shot setting.
    shot_settings = [
        ('Zero-shot', 0, dict(type=ZeroRetriever)),
        ('3-shot', 1, dict(type=FixKRetriever, fix_id_list=[0, 1, 2])),
    ]
    # (language tag, system prompt, answer prefix) per language.
    lang_settings = [
        ('zh', "你是一名运维专家,请回答下面这个问题:\n", "答案:"),
        ('en', "You are an IT operations expert, please answer the following question: \n", "Answer:"),
    ]

    all_configs = []
    for shot_abbr, shot_hint_id, retriever_dict in shot_settings:
        for lang, prompt_hint, answer_hint in lang_settings:
            all_configs.append(dict(
                type=OpsEvalQADataset,
                abbr=f'{dataset_name}-qa-{shot_abbr}-{lang}-sc-gen',
                path=path,
                name=f'{dataset_name}_qa_{lang}',
                reader_cfg=qa_gen_reader_cfg,
                infer_cfg=dict(
                    ice_template=qa_gen_ice_template(prompt_hint, answer_hint),
                    prompt_template=qa_gen_prompt_template(prompt_hint, answer_hint),
                    retriever=retriever_dict,
                    inferencer=get_gen_inferencer(sc=False),
                ),
                # need_ragas flags this dataset for RAGAS scoring downstream.
                eval_cfg=dict(evaluator=dict(type=OpsEvalGenQAEvaluator, language=lang),
                              need_ragas=True),
            ))

    # Keep only configs whose abbr carries one of the requested language tags.
    selected = []
    for lang in langs:
        selected.extend(cfg for cfg in all_configs
                        if lang in cfg['abbr'].replace('_', '-').split('-'))
    return selected


zjyd_qa_gen_datasets = get_qa_gen_datasets('zjyd', f'{ROOT_DIR}data/opseval/zjyd/')
39 changes: 39 additions & 0 deletions configs/datasets/opseval/qa_ppl.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,44 @@ def get_qa_ppl_datasets(dataset_name, path, langs=['zh'], qtypes=None):
selected.extend([d for d in datasets if f'{lang}' in d['abbr'].replace('_', '-').split('-')])
return selected

def get_qa_long_ppl_datasets(dataset_name, path, langs=['zh'], qtypes=None):
    """Build perplexity-scored long-form QA dataset configs for *dataset_name*.

    One config is produced per (shot setting, language) pair; the result is
    then filtered down to the languages listed in *langs*.  *qtypes* is kept
    for signature parity with the other dataset factories but is unused here.
    """
    # (abbr tag, few-shot hint id, retriever config) per shot setting.
    shot_settings = [
        ('Zero-shot', 0, dict(type=ZeroRetriever)),
        ('3-shot', 1, dict(type=FixKRetriever, fix_id_list=[0, 1, 2])),
    ]
    # (language tag, system prompt, answer prefix) per language.
    lang_settings = [
        ('zh', "你是一名运维专家,请回答下面这个问题:\n", "答案:"),
        ('en', "You are an IT operations expert, please answer the following question: \n", "Answer:"),
    ]

    all_configs = []
    for shot_abbr, shot_hint_id, retriever_dict in shot_settings:
        for lang, prompt_hint, answer_hint in lang_settings:
            all_configs.append(dict(
                type=OpsEvalQADataset,
                abbr=f'{dataset_name}-qa-{shot_abbr}-{lang}-sc-ppl',
                path=path,
                name=f'{dataset_name}_qa_{lang}',
                reader_cfg=qa_ppl_reader_cfg,
                infer_cfg=dict(
                    ice_template=qa_ppl_ice_template(prompt_hint, answer_hint),
                    prompt_template=qa_ppl_prompt_template(prompt_hint, answer_hint),
                    retriever=retriever_dict,
                    inferencer=get_ppl_qa_inferencer(),
                ),
                # PPL datasets are scored by mean perplexity, not RAGAS.
                eval_cfg=dict(evaluator=dict(type=MeanEvaluator, field_name='PPL')),
            ))

    # Keep only configs whose abbr carries one of the requested language tags.
    selected = []
    for lang in langs:
        selected.extend(cfg for cfg in all_configs
                        if lang in cfg['abbr'].replace('_', '-').split('-'))
    return selected


zjyd_qa_ppl_datasets = get_qa_ppl_datasets('zjyd', f'{ROOT_DIR}data/opseval/zjyd/')
73 changes: 73 additions & 0 deletions configs/tests/test_ragas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""Smoke-test config for the RAGAS evaluation pipeline.

Runs the Zero-shot Chinese QA datasets through the Qwen1.5 14B chat model
with a tiny sample (sample_size=5) — for testing only, not benchmarking.
"""
from mmengine.config import read_base
from opencompass.partitioners import SizePartitioner, NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

with read_base():
    # Datasets
    from ..datasets.opseval.datasets import owl_qa_gen, rzy_qa_gen, zedx_qa_gen, owl_qa_ppl, rzy_qa_ppl
    # Models
    from ..local_models.google.t5 import t5_base
    from ..local_models.bert.bert import bert_large_cased
    from ..local_models.qwen.qwen import qwen1_5_chat_models

    from ..paths import ROOT_DIR


datasets = [
    *owl_qa_gen,
    *owl_qa_ppl,
    *rzy_qa_gen,
    *rzy_qa_ppl,
    *zedx_qa_gen,
]

# Keep only the Zero-shot Chinese variants for this smoke test.
datasets = [
    dataset for dataset in datasets
    if 'Zero-shot' in dataset['abbr'] and 'zh' in dataset['abbr']
]

# Only the Qwen1.5 chat model whose abbr contains '14' (the 14B variant).
# t5_base / bert_large_cased are imported above as alternative candidates.
models = [
    model for model in qwen1_5_chat_models if '14' in model['abbr']
]

for model in models:
    # Single-GPU, single-process execution for every model.
    model['run_cfg'] = dict(num_gpus=1, num_procs=1)

for dataset in datasets:
    dataset['infer_cfg']['inferencer']['save_every'] = 8
    dataset['infer_cfg']['inferencer']['sc_size'] = 2
    dataset['infer_cfg']['inferencer']['max_token_len'] = 200
    dataset['eval_cfg']['sc_size'] = 2
    # NOTE: this fixed-point assignment replaces any earlier sample_setting;
    # the previous `dataset['sample_setting'] = dict()` dead store was removed.
    dataset['sample_setting'] = dict(sample_size=5)  # !!!WARNING: Use for testing only!!!


infer = dict(
    partitioner=dict(
        # type=SizePartitioner,
        # max_task_size=100,
        # gen_task_coef=1,
        type=NaivePartitioner
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        max_workers_per_gpu=1,
        task=dict(type=OpenICLInferTask),
    ),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLEvalTask)),
)
22 changes: 22 additions & 0 deletions configs/xz/run_qwen1_5_72b_chat.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# NOTE: must be bash, not sh — this script uses bash-only arrays,
# {0..3} brace ranges, and pid[$i]=$! array assignment.
#
# Launch four vLLM OpenAI-compatible API servers for Qwen1.5-72B-Chat,
# one per pair of GPUs (tensor-parallel-size 2), each backed by its own
# Ray instance, listening on ports $PORT .. $PORT+3.
set -x
set -e

MODEL_PATH="/mnt/tenant-home_speed/gaozhengwei/projects/LLM/models/Qwen/Qwen1.5-72B-Chat"
PORT=12310                      # first HTTP port; server i listens on PORT+i
GPUS=("0,1" "2,3" "4,5" "6,7")  # one GPU pair per server

source /root/miniconda3/etc/profile.d/conda.sh && conda activate vllm
for i in {0..3}; do
    # Start a dedicated Ray head for this server on its own port.
    CUDA_VISIBLE_DEVICES=${GPUS[$i]} ray start --head --port $((8012 + $i)) --num-cpus 2
    # NOTE(review): joining the node to the head it just started looks
    # redundant — confirm whether this second `ray start` is required.
    CUDA_VISIBLE_DEVICES=${GPUS[$i]} ray start --address=localhost:$((8012 + $i)) --num-cpus 2
    CUDA_VISIBLE_DEVICES=${GPUS[$i]} RAY_ADDRESS=localhost:$((8012 + $i)) python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_PATH --host 127.0.0.1 --port $(($PORT + $i)) --tensor-parallel-size 2 --gpu-memory-utilization 0.98 --trust-remote-code --max-model-len 2048 & pid[$i]=$!
    echo "port=$(($PORT + $i)), pid=${pid[$i]}"
done
echo "[VLLM] All backend servers have been started!!!"

# Block until every background server process exits.
wait
echo "[VLLM] All backend servers have exited!!!"
ray stop
echo "[VLLM] Ray stopped"
23 changes: 13 additions & 10 deletions configs/xz/runconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

with read_base():
# Datasets
from ..datasets.opseval.datasets import ceval_mc_ppl, network_mc_ppl, zte_mc_ppl, owl_mc_ppl, ceval_mc_gen, network_mc_gen, zte_mc_gen, owl_mc_gen, owl_qa_gen, owl_qa_ppl, rzy_qa_gen, rzy_qa_ppl, oracle_mc_gen, oracle_mc_ppl, company_mc_gen, company_mc_ppl
from ..datasets.opseval.datasets import ceval_mc_ppl, network_mc_ppl, zte_mc_ppl, owl_mc_ppl, ceval_mc_gen, network_mc_gen, zte_mc_gen, owl_mc_gen, owl_qa_gen, owl_qa_ppl, rzy_qa_gen, rzy_qa_ppl, zedx_qa_gen, zedx_qa_ppl, oracle_mc_gen, oracle_mc_ppl, company_mc_gen, company_mc_ppl
from ..datasets.simple_qa.owl_qa import owl_qa_datasets
from ..datasets.ppl_qa.owl_qa import owl_ppl_qa_datasets
from ..datasets.simple_qa.rzy_qa import rzy_qa_datasets
Expand All @@ -25,8 +25,8 @@
model_dataset_combinations = [{
'models': [dict(
type=VLLM,
abbr='nm_qwen1.5_32b_dsir_new_10000_full_owl_network_sft_800steps',
path='/mnt/home/opsfm-xz/sft_checkpoint/xz/qwen1.5-32b-dsir_new_10000-full-owl-network-sft-2000steps/checkpoint-800/merged_model',
abbr='nm_qwen1.5_32b_zedx_full_2000step_sft_2000step',
path='/mnt/tenant-home_speed/xz/sft_checkpoint/qwen1.5-32b-zedx-full-2000step-sft-2000step/merged_model',
max_out_len=400,
max_seq_len=2048,
batch_size=8,
Expand All @@ -47,9 +47,9 @@
}, {
'models': [dict(
type=HuggingFaceCausalLM,
abbr='nm_qwen1.5_32b_dsir_new_10000_full_owl_network_sft_800steps',
path='/mnt/home/opsfm-xz/sft_checkpoint/xz/qwen1.5-32b-dsir_new_10000-full-owl-network-sft-2000steps/checkpoint-800/merged_model',
tokenizer_path='/mnt/home/opsfm-xz/sft_checkpoint/xz/qwen1.5-32b-dsir_new_10000-full-owl-network-sft-2000steps/checkpoint-800/merged_model',
abbr='nm_qwen1.5_32b_zedx_full_2000step_sft_2000step',
path='/mnt/tenant-home_speed/xz/sft_checkpoint/qwen1.5-32b-zedx-full-2000step-sft-2000step/merged_model',
tokenizer_path='/mnt/tenant-home_speed/xz/sft_checkpoint/qwen1.5-32b-zedx-full-2000step-sft-2000step/merged_model',
tokenizer_kwargs=dict(padding_side='left',
truncation_side='left',
trust_remote_code=True,
Expand All @@ -74,22 +74,25 @@
zeroshot_datasets = []
fewshot_datasets = []

for dataset in [*ceval_mc_ppl,*network_mc_ppl,*zte_mc_ppl,*owl_mc_ppl,*oracle_mc_ppl,*company_mc_ppl,*ceval_mc_gen,*network_mc_gen,*zte_mc_gen,*owl_mc_gen,*oracle_mc_gen,*company_mc_gen,*owl_qa_gen,*owl_qa_ppl,*rzy_qa_gen,*rzy_qa_ppl]:
for dataset in [*ceval_mc_ppl,*network_mc_ppl,*zte_mc_ppl,*owl_mc_ppl,*oracle_mc_ppl,*company_mc_ppl,*ceval_mc_gen,*network_mc_gen,*zte_mc_gen,*owl_mc_gen,*oracle_mc_gen,*company_mc_gen,*zedx_qa_gen,*zedx_qa_ppl]:
# dataset['path'] = dataset['path'].replace('/mnt/mfs/opsgpt/evaluation','/mnt/home/opseval/evaluation/')
dataset['sample_setting'] = dict()
dataset['infer_cfg']['inferencer']['save_every'] = 8
dataset['infer_cfg']['inferencer']['sc_size'] = 1
dataset['infer_cfg']['inferencer']['max_out_len'] = 20
# dataset['infer_cfg']['inferencer']['generation_kwargs'] = {'stopping_criteria': ['<|im_end|>', '<|endoftext|>']}
if 'qa' in dataset['abbr'].replace('-', '_').split('_'):
dataset['infer_cfg']['inferencer']['max_out_len'] = 50
if 'zedx' in dataset['abbr']:
dataset['infer_cfg']['inferencer']['max_out_len'] = 100
else:
dataset['infer_cfg']['inferencer']['max_out_len'] = 50
if 'en' in dataset['abbr'].replace('-', '_').split('_'):
continue
if 'sc+cot' in dataset['abbr']:
continue
if 'mc' in dataset['abbr'].replace('-', '_').split('_'):
dataset['sample_setting']= dict(sample_size=500)
if 'owl_qa' in dataset['abbr']:
if 'qa' in dataset['abbr'].replace('-', '_').split('_') and 'ppl' not in dataset['abbr']:
dataset['sample_setting']= dict(sample_size=500)
dataset['eval_cfg']['sc_size'] = 1
if 'network' in dataset['abbr']:
Expand Down Expand Up @@ -127,6 +130,6 @@
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=LocalRunner,
max_num_workers=32,
max_num_workers=8,
task=dict(type=OpenICLEvalTask)),
)
11 changes: 7 additions & 4 deletions configs/xz/runconfig_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

with read_base():
# Datasets
from ..datasets.opseval.datasets import ceval_mc_ppl, network_mc_ppl, zte_mc_ppl, owl_mc_ppl, ceval_mc_gen, network_mc_gen, zte_mc_gen, owl_mc_gen, owl_qa_gen, owl_qa_ppl, rzy_qa_gen, rzy_qa_ppl, oracle_mc_gen, oracle_mc_ppl, company_mc_gen, company_mc_ppl
from ..datasets.opseval.datasets import ceval_mc_ppl, network_mc_ppl, zte_mc_ppl, owl_mc_ppl, ceval_mc_gen, network_mc_gen, zte_mc_gen, owl_mc_gen, owl_qa_gen, owl_qa_ppl, rzy_qa_gen, rzy_qa_ppl, zedx_qa_gen, zedx_qa_ppl, oracle_mc_gen, oracle_mc_ppl, company_mc_gen, company_mc_ppl
from ..datasets.simple_qa.owl_qa import owl_qa_datasets
from ..datasets.ppl_qa.owl_qa import owl_ppl_qa_datasets
from ..datasets.simple_qa.rzy_qa import rzy_qa_datasets
Expand Down Expand Up @@ -41,14 +41,17 @@
dataset['infer_cfg']['inferencer']['max_out_len'] = 20
# dataset['infer_cfg']['inferencer']['generation_kwargs'] = {'stopping_criteria': ['<|im_end|>', '<|endoftext|>']}
if 'qa' in dataset['abbr'].replace('-', '_').split('_'):
dataset['infer_cfg']['inferencer']['max_out_len'] = 50
if 'zedx' in dataset['abbr']:
dataset['infer_cfg']['inferencer']['max_out_len'] = 100
else:
dataset['infer_cfg']['inferencer']['max_out_len'] = 50
if 'en' in dataset['abbr'].replace('-', '_').split('_'):
continue
if 'sc+cot' in dataset['abbr']:
continue
if 'mc' in dataset['abbr'].replace('-', '_').split('_'):
dataset['sample_setting']= dict(sample_size=500)
if 'owl_qa' in dataset['abbr']:
if 'qa' in dataset['abbr'].replace('-', '_').split('_') and 'ppl' not in dataset['abbr']:
dataset['sample_setting']= dict(sample_size=500)
dataset['eval_cfg']['sc_size'] = 1
if 'network' in dataset['abbr']:
Expand Down Expand Up @@ -86,6 +89,6 @@
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=LocalRunner,
max_num_workers=32,
max_num_workers=8,
task=dict(type=OpenICLEvalTask)),
)
29 changes: 22 additions & 7 deletions opencompass/openicl/icl_evaluator/opseval_gen_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,15 +116,24 @@ def score(self, predictions: List, references: List) -> dict:
correct += not_sc(pred, ans)
sc_correct += sc_cot(pred, ans)
return {
'Accuracy': correct / tot * 100,
'SC-Accuracy': sc_correct / tot * 100,
'accuracy': correct / tot * 100,
'sc-accuracy': sc_correct / tot * 100,
}

class OpsEvalGenQAEvaluator(BaseEvaluator):

def __init__(self,
             language='en',
             ragas_config=None,
             ):
    """Initialize the generative-QA evaluator.

    Args:
        language: Language of the answers being scored ('en' or 'zh').
        ragas_config: Optional dict describing the RAGAS scoring service
            endpoint (keys: ``api_ip``, ``api_port``).  When omitted, a
            localhost endpoint on port 12310 is used — presumably one of
            the vLLM backends on ports 12310~12313 (confirm against the
            serving script).
    """
    super().__init__()
    self.language = language
    # Fall back to the default local endpoint; use `is None` (identity
    # check) rather than `== None`.
    if ragas_config is None:
        ragas_config = dict(
            api_ip='localhost',
            api_port=12310,  # 12310 ~ 12313
        )
    self.ragas_config = ragas_config

def score(self, predictions: List, references: List, test_set: List) -> dict:
tot_bleu, tot_rouge = 0, 0
Expand All @@ -136,7 +145,9 @@ def score(self, predictions: List, references: List, test_set: List) -> dict:
report = {
"bleu": tot_bleu / len(predictions),
"rouge": tot_rouge / len(predictions),
"ragas": ragas_report
"ragas_score": ragas_report["score"],
"ragas_acc": ragas_report["accuracy"],
"ragas_report": ragas_report
}
return report

Expand Down Expand Up @@ -166,7 +177,7 @@ def get_ragas_score(self, predictions, references, test_set) -> dict:
for idx, (question, ref) in enumerate(zip(test_set['question'], references))]
answers = [{"id": idx, "question": question, "answer": ans}
for idx, (question, ans) in enumerate(zip(test_set['question'], predictions))]
report = calculate_score(reference, answers)
report = calculate_score(reference, answers, self.ragas_config)
return report

class OpsEvalRagasEvaluator(BaseEvaluator):
Expand All @@ -180,5 +191,9 @@ def score(self, predictions: List, references: List, test_set: List) -> dict:
for idx, (question, ref) in enumerate(zip(test_set['question'], references))]
answers = [{"id": idx, "question": question, "answer": ans}
for idx, (question, ans) in enumerate(zip(test_set['question'], predictions))]
report = calculate_score(reference, answers)
return report
try:
report = calculate_score(reference, answers)
except Exception as err:
print(f"[OpsEvalRagasEvaluator] {err}")
report = {}
return report
Loading

0 comments on commit ec877a5

Please sign in to comment.