summeval.py
import functools
import json
import os
import typing

import evaluate
import pandas

from eval_utils import eval_and_write


def clean_text(s: str) -> str:
    # normalize whitespace: tabs -> spaces, then trim both ends
    s = s.replace("\t", " ")
    s = s.strip()
    return s
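# Example:
# >>> clean_text("\tA summary\twith tabs.\n")
# 'A summary with tabs.'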

def pool_human_rating(
        human_ratings: typing.List[dict],
        pool_method: str = "mean") -> dict:
    # input: one rating dict per expert annotator, e.g.,
    # [{'coherence': 2, 'consistency': 1, 'fluency': 4, 'relevance': 2},
    #  {'coherence': 1, 'consistency': 1, 'fluency': 2, 'relevance': 1},
    #  {'coherence': 1, 'consistency': 1, 'fluency': 3, 'relevance': 2}]
    df = pandas.DataFrame(human_ratings)
    if pool_method == "mean":
        return df.mean().to_dict()
    raise ValueError(f"unsupported pool_method: {pool_method}")
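# Example (uses the three annotator dicts shown above; only "mean" pooling
# is implemented):
# >>> pool_human_rating([
# ...     {'coherence': 2, 'consistency': 1, 'fluency': 4, 'relevance': 2},
# ...     {'coherence': 1, 'consistency': 1, 'fluency': 2, 'relevance': 1},
# ...     {'coherence': 1, 'consistency': 1, 'fluency': 3, 'relevance': 2}])
# {'coherence': 1.33..., 'consistency': 1.0, 'fluency': 3.0, 'relevance': 1.66...}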

def load_summeval(paired_jsonl):
    human_metrics = ['coherence', 'consistency', 'fluency', 'relevance']
    with open(paired_jsonl, 'r', encoding='utf-8') as fd:
        dataset = [json.loads(line) for line in fd]
    df = pandas.DataFrame(dataset)
    # df.columns ==>
    # ['id', 'decoded', 'expert_annotations', 'turker_annotations',
    #  'references', 'model_id', 'filepath', 'metric_scores_1',
    #  'metric_scores_6', 'metric_scores_11', 'text']
    # flatten the nested precalculated metrics
    tdf = df['metric_scores_1'].to_list()
    for row in tdf:
        row.update(row['rouge'])
        row['supert'] = row['supert'][0]
        del row['rouge']
    df = pandas.concat([df, pandas.DataFrame(tdf)], axis=1)
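    # After the flatten-and-concat above, every nested score becomes its own
    # column (values below are hypothetical, for illustration only):
    #   row['rouge']  == {'rouge_1_f_score': 0.31, ...}  ->  df['rouge_1_f_score']
    #   row['supert'] == [0.42]                          ->  df['supert'] == 0.42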
    for refId in range(11):
        df[f"ReferenceSummary_{refId}"] = df["text"]  # placeholder, filled below
    for human_metric in human_metrics:
        df[human_metric] = df["id"]  # placeholder, filled below
    # clean up
    df = df.rename(columns={'decoded': 'SystemSummary', 'text': 'ArticleText',
                            'model_id': 'system'})
    for index, row in df.iterrows():
        for refId in range(11):
            df.at[index, f"ReferenceSummary_{refId}"] = clean_text(row["references"][refId])
        pooled_human_ratings = pool_human_rating(row['expert_annotations'])
        for human_metric in human_metrics:
            df.at[index, human_metric] = pooled_human_ratings[human_metric]
        for column in ['ArticleText', 'SystemSummary']:
            df.at[index, column] = clean_text(row[column])
    df = df.drop(
        columns=['filepath', 'metric_scores_1', 'metric_scores_6', 'metric_scores_11',
                 'expert_annotations', 'turker_annotations', 'references'])
    return df
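# Minimal usage sketch (the relative path is an assumption mirroring the
# exp_config entry below):
#   df = load_summeval("dataloader/summeval_annotations.aligned.paired.scored.jsonl")
#   df[["system", "coherence", "consistency", "fluency", "relevance"]].head()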
# In [7]: df.iloc[1]["expert_annotations"]
# Out[7]:
# [{'coherence': 3, 'consistency': 5, 'fluency': 5, 'relevance': 2},
# {'coherence': 2, 'consistency': 5, 'fluency': 5, 'relevance': 3},
# {'coherence': 2, 'consistency': 5, 'fluency': 5, 'relevance': 3}]
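# pool_human_rating() reduces the three expert dicts above to per-dimension
# means, e.g. coherence -> (3 + 2 + 2) / 3 ≈ 2.33.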

def main(exp_config: dict):
    dataset_name = exp_config["dataset_name"]
    dataset_df = load_summeval(exp_config["data_path"])
    eval_and_write(dataset_name, dataset_df, exp_config)

if __name__ == "__main__":
    # Assumption: the SummEval jsonl sits under dataloader/ next to this script.
    path = os.path.dirname(os.path.abspath(__file__))
    exp_config = {
        # about the dataset and dataframe
        "dataset_name": "summeval",
        "human_metrics": ["consistency", "relevance", "coherence", "fluency"],
        "docID_column": "id",
        "document_column": "ArticleText",
        "system_summary_column": "SystemSummary",
        "reference_summary_column": "ReferenceSummary_0",  # the refId ranges from 0 to 10
        # about the experiments
        "nlg_metrics": {
            "bleurt": evaluate.load('bleurt', config_name='BLEURT-20', module_type='metric').compute,
            "rouge": functools.partial(evaluate.load("rouge").compute, use_aggregator=False),
            "bertscore": functools.partial(evaluate.load("bertscore").compute, lang='en', use_fast_tokenizer=True),
        },
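        # Each nlg_metrics value above follows the HuggingFace `evaluate`
        # compute() interface, i.e. it is called as
        # fn(predictions=[...], references=[...]); functools.partial pre-binds
        # the per-metric keyword arguments. Sketch:
        #   scores = exp_config["nlg_metrics"]["rouge"](
        #       predictions=["the cat sat"], references=["the cat sat on the mat"])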
"corr_metrics" : ["spearman", "pearson", "kendalltau"],
"approaches": ["trad", "new"],
"eval_levels": ["summary", "system"],
"data_path": os.path.join(path, "dataloader/summeval_annotations.aligned.paired.scored.jsonl"),
"result_path_root": "./results/",
"precal_metrics": [ # keys from original SummEval json file
'rouge_1_precision', 'rouge_1_recall', 'rouge_1_f_score',
'rouge_2_precision', 'rouge_2_recall', 'rouge_2_f_score',
'rouge_l_precision', 'rouge_l_recall', 'rouge_l_f_score',
'rouge_we_1_p', 'rouge_we_1_r', 'rouge_we_1_f',
'rouge_we_2_p', 'rouge_we_2_r', 'rouge_we_2_f',
'meteor', 'cider', 's3_pyr', 's3_resp',
'mover_score', 'sentence_movers_glove_sms', 'bleu',
'bert_score_precision', 'bert_score_recall', 'bert_score_f1',
'blanc', 'summaqa_avg_prob', 'summaqa_avg_fscore', 'supert']
}
main(exp_config)