Commit 906e3bb

added ragas to OpsEvalGenQAEvaluator

NickLennonLiu committed Apr 9, 2024
1 parent fa78e22 commit 906e3bb
Showing 9 changed files with 38 additions and 67 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ configs/openai_key.py
 configs/paths.py
 configs/api_key.py
 configs/secrets.py
+configs/ragas_config.toml
 configs/datasets/log.json
 configs/eval_debug*.py
 configs/viz_*.py
2 changes: 1 addition & 1 deletion configs/datasets/opseval/qa_gen.py
@@ -2,7 +2,7 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever, ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer, SCInferencer, CoTInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator, BleuRougeEvaluator, OpsEvalGenQAEvaluator
+from opencompass.openicl.icl_evaluator import AccEvaluator, BleuRougeEvaluator, OpsEvalGenQAEvaluator, OpsEvalRagasEvaluator
 from opencompass.utils.text_postprocessors import first_capital_postprocess_multi
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
4 changes: 3 additions & 1 deletion configs/lyh/claude.py
@@ -18,7 +18,9 @@
 ]
 
 # Remove COT and 3-shot datasets
-datasets = [dataset for dataset in datasets if 'cot' not in dataset['abbr'] and '3-shot' not in dataset['abbr']]
+# datasets = [dataset for dataset in datasets if 'cot' not in dataset['abbr'] and '3-shot' not in dataset['abbr']]
+# Naive COT only
+datasets = [dataset for dataset in datasets if 'cot' in dataset['abbr'] and '3-shot' not in dataset['abbr']]
 
 models = [
     claude_3_opus
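Note: the comprehension flips from excluding CoT to requiring it, which is easy to misread. A tiny sketch of the new filter's effect, with made-up abbr values:

# Hypothetical abbrs, for illustration only: the new filter keeps
# zero-shot CoT variants and drops everything else.
abbrs = ['qa-gen', 'qa-cot-gen', 'qa-cot-3-shot-gen', 'qa-3-shot-gen']
kept = [a for a in abbrs if 'cot' in a and '3-shot' not in a]
assert kept == ['qa-cot-gen']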
2 changes: 1 addition & 1 deletion configs/lyh/vllm_qa_1.py
@@ -62,6 +62,6 @@
     partitioner=dict(type=NaivePartitioner),
     runner=dict(
         type=LocalRunner,
-        max_num_workers=32,
+        max_num_workers=1,
         task=dict(type=OpenICLEvalTask)),
 )
34 changes: 23 additions & 11 deletions opencompass/openicl/icl_evaluator/opseval_gen_evaluator.py
@@ -126,16 +126,19 @@ def __init__(self, language='en'):
         super().__init__()
         self.language = language
 
-    def score(self, predictions: List, references: List) -> dict:
+    def score(self, predictions: List, references: List, test_set: List) -> dict:
         tot_bleu, tot_rouge = 0, 0
         for pred, ans in zip(predictions, references):
             bleu_score, rouge_score = self.get_rouge_bleu(pred, ans, self.language)
             tot_bleu += bleu_score
             tot_rouge += rouge_score
-        return {
+        ragas_report = self.get_ragas_score(predictions, references, test_set)
+        report = {
             "bleu": tot_bleu / len(predictions),
-            "rouge": tot_rouge / len(predictions)
+            "rouge": tot_rouge / len(predictions),
+            "ragas": ragas_report
         }
+        return report
 
     def get_rouge_bleu(self, pred, answer, language='en'):
         rouge = Rouge()
@@ -157,16 +160,25 @@ def clean_word(words):
             bleu_score, rouge_score = 0.0, 0.0
         return bleu_score, rouge_score
 
+    def get_ragas_score(self, predictions, references, test_set) -> dict:
+        from opencompass.ragas.judge import calculate_score
+        reference = [{"id": idx, "question": question, "answer": ref}
+                     for idx, (question, ref) in enumerate(zip(test_set['question'], references))]
+        answers = [{"id": idx, "question": question, "answer": ans}
+                   for idx, (question, ans) in enumerate(zip(test_set['question'], predictions))]
+        report = calculate_score(reference, answers)
+        return report
+
 class OpsEvalRagasEvaluator(BaseEvaluator):
 
-    def __init__(self):
+    def __init__(self, language='en'):
         super().__init__()
 
-    def score(self, predictions: List, references: List) -> dict:
-        from opencompass.ragas.judge import calculate_scores
-        reference = {"id": idx, "answer": ref
-                     for idx, ref in enumerate(references)}
-        answers = {"id": idx, "answer": ans
-                   for idx, ans in enumerate(predictions)}
-        report = calculate_scores(reference, answers)
+    def score(self, predictions: List, references: List, test_set: List) -> dict:
+        from opencompass.ragas.judge import calculate_score
+        reference = [{"id": idx, "question": question, "answer": ref}
+                     for idx, (question, ref) in enumerate(zip(test_set['question'], references))]
+        answers = [{"id": idx, "question": question, "answer": ans}
+                   for idx, (question, ans) in enumerate(zip(test_set['question'], predictions))]
+        report = calculate_score(reference, answers)
         return report
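Note: with the new signature, score() needs the test set as well, since the ragas report is built from the original questions. A minimal usage sketch (the data is invented; actually running it requires a valid configs/ragas_config.toml for the judge model):

from opencompass.openicl.icl_evaluator import OpsEvalGenQAEvaluator

# Invented toy data; test_set only needs a 'question' column, matching the
# zip(test_set['question'], ...) access inside score().
predictions = ["Restart nginx, then check /var/log/nginx/error.log."]
references = ["Restart the nginx service and inspect its error log."]
test_set = {"question": ["How do I recover from an nginx 502 error?"]}

evaluator = OpsEvalGenQAEvaluator(language='en')
report = evaluator.score(predictions, references, test_set)
# report -> {"bleu": ..., "rouge": ..., "ragas": {...}}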
6 changes: 3 additions & 3 deletions opencompass/ragas/__init__.py
@@ -1,3 +1,3 @@
-from config import *
-from judge import *
-from metric import *
+from .config import *
+from .judge import *
+from .metric import *
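Note: the bare `from config import *` resolves against sys.path rather than the package, so importing opencompass.ragas from the repo root raised ModuleNotFoundError: No module named 'config'. The leading dot makes each lookup package-relative:

# After this change the package imports cleanly from any working directory:
import opencompass.ragas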
6 changes: 4 additions & 2 deletions opencompass/ragas/config.py
@@ -4,15 +4,17 @@
 from pip._vendor import tomli
 from langchain_core.language_models import BaseLanguageModel
 from langchain_core.embeddings import Embeddings
+from mmengine.config import read_base
 
-CONFIG_PATH = os.getenv('CONFIG_PATH', 'config/config.toml')
+CURRENT_PATH = os.path.dirname(__file__)
+CONFIG_PATH = os.path.join(CURRENT_PATH, '../../configs', 'ragas_config.toml')
 
 logger = logging.getLogger(__name__)
 
 
 def load_config() -> dict:
     if not os.path.exists(CONFIG_PATH):
-        logger.error('Config file does not exist')
+        logger.error(f'Config file {CONFIG_PATH} does not exist')
         sys.exit(1)
 
     with open(CONFIG_PATH, 'r') as f:
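Note: CONFIG_PATH is now resolved relative to the module file instead of the working directory, so the repo-level configs/ directory is found regardless of where the run is launched from. A sketch of the resolution, using a stand-in repo path:

import os

current_path = '/repo/opencompass/ragas'  # stand-in for os.path.dirname(__file__)
config_path = os.path.normpath(
    os.path.join(current_path, '../../configs', 'ragas_config.toml'))
assert config_path == '/repo/configs/ragas_config.toml'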
46 changes: 0 additions & 46 deletions opencompass/ragas/config/config.toml

This file was deleted.

4 changes: 2 additions & 2 deletions opencompass/ragas/judge.py
@@ -3,7 +3,7 @@
 import logging
 import argparse
 import pandas as pd
-from config import config, load_llm, load_embeddings
+from opencompass.ragas.config import config, load_llm, load_embeddings
 from datasets import Dataset
 
 logger = logging.getLogger(__name__)
@@ -141,7 +141,7 @@ def compute_scores(df: pd.DataFrame) -> list[dict]:
     os.environ["LANGCHAIN_ENDPOINT"] = langsmith_config.get('endpoint')
 
     from ragas import evaluate, RunConfig
-    from metric import answer_correctness
+    from opencompass.ragas.metric import answer_correctness
     from langchain.callbacks.tracers import LangChainTracer
 
     new_df = df.copy()
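Note: both evaluators now pass calculate_score two parallel lists of {"id", "question", "answer"} dicts. The real implementation lives in opencompass/ragas/judge.py and is not shown in this diff; purely as a sketch of a compatible shape, assuming ragas' evaluate() API and the answer_correctness metric imported above:

import pandas as pd
from datasets import Dataset

def calculate_score(reference: list[dict], answers: list[dict]) -> dict:
    # Sketch only: align references and model answers on "id", then score
    # the question/answer/ground-truth triples with ragas.
    ref_df = pd.DataFrame(reference).rename(columns={'answer': 'ground_truth'})
    ans_df = pd.DataFrame(answers)[['id', 'answer']]
    df = ref_df.merge(ans_df, on='id')

    from ragas import evaluate
    from opencompass.ragas.metric import answer_correctness

    dataset = Dataset.from_pandas(df[['question', 'answer', 'ground_truth']])
    result = evaluate(dataset, metrics=[answer_correctness])
    return dict(result)  # Result behaves like a mapping: metric name -> score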
