forked from microsoft/muzic
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheval_genre.py
104 lines (82 loc) · 3.21 KB
/
eval_genre.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#
from fairseq.models.roberta import RobertaModel
import numpy as np
import torch
import torch.nn.functional as F
import sklearn.metrics
import sys
import os
# Length every sequence is padded to before batching: 1024 when the
# 'disable_cp' environment variable is set, 8192 otherwise.
max_length = 1024 if 'disable_cp' in os.environ else 8192
# Number of dataset items sent through the model per prediction call.
batch_size = 4
# Number of folds the evaluation loop iterates over.
n_folds = 1

# One result list per "<metric>_<average>" combination; the evaluation loop
# appends one entry to each list per fold.
scores = {
    metric + "_" + avg: []
    for metric in ("f1_score", "roc_auc_score")
    for avg in ("macro", "micro", "weighted", "samples")
}
def label_fn(label, label_dict):
    """Return the dictionary string for class index ``label``.

    The class index is shifted by ``label_dict.nspecial`` and the single
    shifted index is handed to ``label_dict.string`` wrapped in a list.
    """
    dictionary_index = label + label_dict.nspecial
    return label_dict.string([dictionary_index])
# Evaluate every fold: load the fold's checkpoint and validation split, run
# the classification head over the whole split in batches, then compute the
# multilabel F1 / ROC-AUC metrics and record them in ``scores``.
#
# Command line: sys.argv[1] is the checkpoint path pattern and sys.argv[2]
# the data path pattern; every 'x' in either is replaced by the fold index.
for fold in range(n_folds):
    print('loading model and data')
    print('start evaluating fold {}'.format(fold))

    # The dataset variant (and with it class count and head name) is chosen
    # from the checkpoint path.
    is_topmagd = 'topmagd' in sys.argv[1]
    num_classes = 13 if is_topmagd else 25
    head_name = 'topmagd_head' if is_topmagd else 'masd_head'

    roberta = RobertaModel.from_pretrained(
        '.',
        checkpoint_file=sys.argv[1].replace('x', str(fold)),
        data_name_or_path=sys.argv[2].replace('x', str(fold)),
        user_dir='musicbert'
    )
    roberta.task.load_dataset('valid')
    dataset = roberta.task.datasets['valid']
    label_dict = roberta.task.label_dictionary
    pad_index = label_dict.pad()
    roberta.cuda()
    roberta.eval()

    y_true = []
    y_pred = []

    def padded(seq):
        """Right-pad the 1-D array ``seq`` with ``pad_index`` to ``max_length``.

        Raises:
            ValueError: if ``seq`` is already longer than ``max_length``.
        """
        pad_length = max_length - seq.shape[0]
        if pad_length < 0:
            # Explicit raise instead of ``assert`` so the check survives -O.
            raise ValueError(
                'sequence length {} exceeds max_length {}'.format(
                    seq.shape[0], max_length))
        return np.concatenate(
            (seq, np.full((pad_length,), pad_index, dtype=seq.dtype)))

    n_samples = len(dataset)
    for start in range(0, n_samples, batch_size):
        stop = min(start + batch_size, n_samples)
        # Fetch each sample once and reuse it for both target and source.
        samples = [dataset[j] for j in range(start, stop)]

        # (batch, max_length) label ids -> multi-hot (batch, num_classes).
        # The first 4 columns are dropped; presumably these are the
        # dictionary's special symbols (label_dict.nspecial) -- TODO confirm.
        target = torch.from_numpy(
            np.vstack([padded(s['target'].numpy()) for s in samples]))
        target = F.one_hot(target.long(), num_classes=(num_classes + 4))
        target = target.sum(dim=1)[:, 4:]

        # NOTE(review): source tokens are padded with the *label* dictionary's
        # pad index; assumes the source dictionary uses the same index.
        source = torch.from_numpy(
            np.vstack([padded(s['source'].numpy()) for s in samples]))

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            output = torch.sigmoid(roberta.predict(head_name, source, True))

        y_true.append(target.detach().cpu().numpy())
        y_pred.append(output.detach().cpu().numpy())
        print('evaluating: {:.2f}%'.format(
            start / n_samples * 100), end='\r', flush=True)

    y_true = np.vstack(y_true)
    y_pred = np.vstack(y_pred)

    print()
    for class_id in range(num_classes):
        print(class_id, label_fn(class_id, label_dict))
    print(y_true.shape)
    print(y_pred.shape)

    # with open('genre.npy', 'wb') as f:
    #     np.save(f, {'y_true': y_true, 'y_pred': y_pred})

    for score in ["f1_score", "roc_auc_score"]:
        metric_fn = getattr(sklearn.metrics, score)
        # F1 needs hard 0/1 predictions; ROC-AUC works on the probabilities.
        y_score = np.round(y_pred) if score == "f1_score" else y_pred
        for average in ["macro", "micro", "weighted", "samples"]:
            key = score + "_" + average
            try:
                result = metric_fn(y_true, y_score, average=average)
            except Exception as e:
                # A metric that cannot be computed for this fold is reported
                # and recorded as None. KeyboardInterrupt / SystemExit are
                # deliberately not caught.
                print("{}_{}:".format(score, average), e)
                scores[key].append(None)
            else:
                print("{}_{}:".format(score, average), result)
                scores[key].append(result)
# Final report: dump the raw per-fold results, then the mean of each metric.
print(scores)
for k, fold_results in scores.items():
    # Folds whose metric could not be computed are stored as None; average
    # only the folds that produced a value, and report NaN when none did
    # (instead of failing with TypeError / ZeroDivisionError).
    valid_results = [r for r in fold_results if r is not None]
    if valid_results:
        mean_result = sum(valid_results) / len(valid_results)
    else:
        mean_result = float('nan')
    print(k, mean_result)