-
Notifications
You must be signed in to change notification settings - Fork 0
/
MFCCvsNoise1.py
112 lines (90 loc) · 5.3 KB
/
MFCCvsNoise1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import sys
import time
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost as xgb
from glob import glob
def load_data(file_name):
    """Load a semicolon-separated feature table into a DataFrame.

    Args:
        file_name: Path of the CSV file (passed straight to
            ``pandas.read_csv``, so anything it accepts works).

    Returns:
        pandas.DataFrame with the file's contents. The ``STUDENT`` column,
        when present, is read as ``str`` so identifiers such as ``"007"``
        keep their leading zeros.
    """
    # NOTE: this message is printed unconditionally, before the file is
    # opened; it does not verify that the file exists.
    print('FILE EXIST')
    featuresDF = pd.read_csv(file_name, sep=';', dtype={'STUDENT': str})
    return featuresDF
def test_classifier(clf_name, clf, X_train, y_train, X_test, y_test):
    """Fit a classifier on the training split and report its test accuracy.

    Args:
        clf_name: Label printed before the accuracy value.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted by this call.
        X_train, y_train: Training samples and their labels.
        X_test, y_test: Held-out samples and their labels.

    Returns:
        The value of ``accuracy_score(y_test, y_pred)`` for the predictions
        made on ``X_test``.
    """
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Compute the score once and reuse it for both the printout and the
    # return value instead of evaluating it twice.
    accuracy = accuracy_score(y_test, y_pred)
    print(clf_name)
    print(accuracy)
    return accuracy
def cross_validate(clf, X, y, features, n_splits=2, test_size=0.2, random_state=5):
    """Score a classifier on repeated group-aware random train/test splits.

    The splits come from ``GroupShuffleSplit`` with ``features['FILE']`` as
    the grouping key, so all rows sharing a ``FILE`` value land on the same
    side of each split. These are independent shuffle splits, not k-fold
    partitions.

    Args:
        clf: Estimator whose ``fit(X, y)`` returns an object exposing
            ``score(X, y)``. The same object is refit on every split, so
            after this call it reflects the last split's training portion.
        X: Indexable sample matrix (indexed with integer index arrays).
        y: Indexable label vector aligned with ``X``.
        features: Table providing the ``'FILE'`` column used as groups.
        n_splits: Number of shuffle splits to evaluate (default 2).
        test_size: Proportion passed to ``GroupShuffleSplit`` (default 0.2).
        random_state: Seed for reproducible splits (default 5).

    Returns:
        List with one score per split, in split order.
    """
    splitter = GroupShuffleSplit(n_splits=n_splits, test_size=test_size,
                                 random_state=random_state)
    cv_scores = [clf.fit(X[train], y[train]).score(X[test], y[test])
                 for train, test in splitter.split(X, y, features['FILE'])]
    return cv_scores
def mfcc_count_from_path(path):
    """Return the text after the last underscore of a feature file's stem.

    For ``'../mfcc_data/MFCC_13.csv'`` this is ``'13'``. Both ``/`` and
    ``\\`` are treated as directory separators, and only a literal trailing
    ``'.csv'`` is removed (``str.strip('.csv')`` would instead strip any of
    the characters ``.``, ``c``, ``s``, ``v`` from both ends).

    Args:
        path: File path or bare file name.

    Returns:
        The last underscore-separated token of the file name, as a string.
    """
    base_name = path.replace('\\', '/').rsplit('/', 1)[-1]
    if base_name.endswith('.csv'):
        base_name = base_name[:-len('.csv')]
    return base_name.rsplit('_', 1)[-1]


if __name__ == '__main__':
    import os  # local import: only the script entry point needs it

    # This value is passed from the *.sh file; it selects which feature file
    # this job processes, so many jobs can be submitted to the HPC at once.
    if len(sys.argv) < 2:
        sys.exit('usage: {} <feature_file_index>'.format(sys.argv[0]))
    idx = int(sys.argv[1])

    feature_path = '../mfcc_data'
    # Plain glob order is not guaranteed, so sort explicitly: first
    # lexicographically, then (stable) by the integer following 'MFCC_' so
    # that e.g. MFCC_2 comes before MFCC_10.
    feature_files = sorted(glob(feature_path + '/*.csv'))
    feature_file_sorted = sorted(
        feature_files,
        key=lambda x: int(x.split('MFCC_')[1].split('.csv')[0]))
    feature_file = feature_file_sorted[idx]
    print(feature_file)
    features = load_data(feature_file)

    no_mfcc = mfcc_count_from_path(feature_file)
    results_file = 'resultsMFCC_{}.csv'.format(no_mfcc)
    print(results_file)

    # Make sure the output directory exists before the long-running
    # training starts, so results are not lost at the final write.
    results_dir = './results_1/'
    os.makedirs(results_dir, exist_ok=True)

    # NOTE(review): the '5Fold_*' column labels are kept for compatibility
    # with existing result files, but cross_validate() is called with its
    # defaults here, which is not a 5-fold scheme - confirm intended naming.
    results = pd.DataFrame(columns=['No_MFCC', 'Classifier', 'Accuracy', '5Fold_cv_MEAN', '5Fold_CV', 'Time_sec'])

    # create design matrix X and target vector y
    X = features.filter(like='MFCC').values
    y = features['LABEL_GROUP'].values

    # It is important to use "GroupShuffleSplit" together with the "groups"
    # argument of "split": this keeps all samples from one audio file in
    # either the training or the test set. Otherwise it would be like
    # cheating, because the MFCC coefficients are calculated with 75%
    # overlap, so near-duplicate samples could end up on both sides.
    sss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
    train_index, test_index = next(sss.split(X, y, features['FILE']))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(pd.DataFrame(y_train)[0].value_counts())

    # Factories rather than instances: each model is created inside the
    # timed section and can be released before the next one is built.
    classifier_factories = [
        lambda: KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
        lambda: KNeighborsClassifier(n_neighbors=7, n_jobs=-1),
        lambda: KNeighborsClassifier(n_neighbors=10, n_jobs=-1),
        lambda: RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
        lambda: ExtraTreesClassifier(n_estimators=100, random_state=0, n_jobs=-1),
        lambda: xgb.XGBClassifier(tree_method='approx', random_state=0, n_jobs=-1),
    ]

    for make_clf in classifier_factories:
        start_time = time.time()
        clf = make_clf()
        clf_name = type(clf).__name__
        if isinstance(clf, KNeighborsClassifier):
            # Distinguish the k-NN variants by their neighbour count.
            clf_name += str(clf.n_neighbors)
        accuracy = test_classifier(clf_name, clf, X_train, y_train, X_test, y_test)
        # Covers model construction, fitting, prediction and scoring on the
        # single hold-out split; the cross-validation below is not timed.
        training_time = time.time() - start_time
        accuracy_cv = cross_validate(clf, X, y, features)
        results.loc[len(results)] = [
            no_mfcc, clf_name, accuracy,
            np.mean(accuracy_cv), accuracy_cv, training_time,
        ]

    results.to_csv(results_dir + results_file, sep=';', float_format='%.4f')