-
Notifications
You must be signed in to change notification settings - Fork 1
/
models_comparison.py
121 lines (94 loc) · 4.2 KB
/
models_comparison.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import pandas as pd
from sklearn import metrics
import scipy.stats as ss
import matplotlib.pyplot as plt
import scikit_posthocs as sp
from natsort import natsorted
from plots import save_plot
# Default output directory for figures: a "plots" folder under the current
# working directory. It is the default `outdir` of the plotting functions below.
plots_dir = os.path.join(os.getcwd(), 'plots')
# Created at import time; exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(plots_dir, exist_ok=True)
def merge_csv_files(indices, files_list, path=None):
    """
    Collect the ``Anomaly`` column of several csv files into one dataframe.

    Files are processed in natural sort order. Each file contributes one column,
    named after the last four characters of the file name without its ``.csv``
    extension. Two files sharing the same last four characters overwrite each other.

    :param indices: list, numpy, list of indices used as index of the result;
                    its length must match the number of rows of every csv file
    :param files_list: str list of csv files
    :param path: str path to csv files; when None the entries of ``files_list``
                 are used as they are (relative to the working directory or absolute)
    :return: a data frame with one column per csv file
    """
    suffix = '.csv'
    anomaly_df = pd.DataFrame(index=indices)
    for f in natsorted(files_list):
        # os.path.join(None, f) raises TypeError, so only join when a folder is given.
        csv_path = f if path is None else os.path.join(path, f)
        temp_df = pd.read_csv(csv_path)
        # str.strip('.csv') removes any of the characters '.', 'c', 's', 'v' from
        # both ends (e.g. 'run_svc.csv' -> 'run_'), so drop the suffix explicitly.
        stem = f[:-len(suffix)] if f.endswith(suffix) else f
        col_name = stem[-4:]
        anomaly_df[col_name] = temp_df.Anomaly.values
    return anomaly_df
def concat_dataframes(df_list):
    """
    Concatenate dataframes side by side (column-wise), aligning rows by position.

    The result carries the index of the first dataframe. The input dataframes
    are left untouched.

    :param df_list: a list of dataframes to concat; must contain at least one
                    dataframe (an empty list raises IndexError)
    :return: a concatenated dataframe indexed like ``df_list[0]``
    """
    indices = df_list[0].index
    # reset_index(drop=True) without inplace returns a new frame, so the callers'
    # dataframes keep their own index; a positional index on every piece makes
    # pd.concat align rows by position instead of by label.
    positional = [df.reset_index(drop=True) for df in df_list]
    concat_df = pd.concat(positional, axis=1)
    return concat_df.set_index(indices, drop=True)
def friedman_conover_comparison(df, var_name='model', value_name='anomaly', plot_name=None,
                                outdir=plots_dir):
    """
    Run a Friedman test across the columns of ``df``; when it is significant
    (p < 0.05) apply the Conover-Friedman post-hoc comparison and save/show
    the significance heatmap and the critical difference diagram.

    :param df: dataframe with the data to compare - one column per model;
               the row axis is treated as 'cv_fold'
    :param var_name: str name of the grouping variable - models
    :param value_name: str name of the variable with the results - response variable
    :param plot_name: str name of the plot, used in the saved file names
    :param outdir: str directory to save the plots
    :return: tuple (stat, p_value, avg_rank) - Friedman statistic, its p-value and
             the mean within-fold percentile rank of each model
    Note: this code is adapted from scikit-posthoc tutorial
    https://scikit-posthocs.readthedocs.io/en/latest/tutorial.html
    """
    heatmap_args = {'linewidths': 0.5, 'linecolor': 'k', 'clip_on': False, 'square': True,
                    'cbar_ax_bbox': [0.85, 0.35, 0.04, 0.3]}

    # Long format: one row per (cv_fold, model) pair.
    df_ = df.rename_axis('cv_fold').melt(var_name=var_name, value_name=value_name,
                                         ignore_index=False).reset_index()
    # Rank the models inside every fold, then average those ranks per model.
    avg_rank = df_.groupby('cv_fold')[value_name].rank(pct=True).groupby(df_[var_name]).mean()

    # Each column of df is passed as one sample of the test.
    stat, p_value = ss.friedmanchisquare(*df.values.T)

    if p_value < 0.05:
        print(f'p_value : {str(p_value)}, we can reject the null hypothesis H0 with 95% certainty')
        print(' ')
        print('Post-hoc Conover-Friedman multiple comparison is applied')
        significance = sp.posthoc_conover_friedman(df, p_adjust='holm')
        sp.sign_plot(significance, **heatmap_args)
        save_plot(outdir=outdir, plot_name=f'posthoc_{plot_name}')
        plt.show()
        print(' ')
        # Forward outdir so both figures end up in the directory the caller asked for;
        # without it the diagram is always written to the module default.
        plot_critical_difference(significance, avg_rank, plot_name=plot_name, outdir=outdir)

    return stat, p_value, avg_rank
def plot_critical_difference(posthoc=None, ranks_df=None, plot_name=None, outdir=plots_dir):
    """
    Draw, save and show the critical difference diagram of the average ranks.

    :param posthoc: dataframe/ dict with the results from the posthoc comparison
    :param ranks_df: dataframe with ranks
    :param plot_name: str name to save the plot; the file name is prefixed with 'critical_dif_'
    :param outdir: str path to save the plots
    :return: the value returned by ``plt.show()``
    Note: this code is adapted from scikit-posthoc tutorial
    https://scikit-posthocs.readthedocs.io/en/latest/tutorial.html
    """
    diagram_title = 'Critical difference diagram of average score ranks'
    output_name = f'critical_dif_{plot_name}'

    # Wide, short canvas for the diagram.
    plt.figure(figsize=(10, 2), dpi=100)
    plt.title(diagram_title)
    sp.critical_difference_diagram(ranks_df, posthoc)

    # Save before showing the figure.
    save_plot(outdir=outdir, plot_name=output_name)
    shown = plt.show()
    return shown
def compare_clusters_metrics(df_raw, df_cluster):
    """
    Score every clustering in ``df_cluster`` against the raw data.

    For each column of ``df_cluster`` the silhouette, Calinski-Harabasz and
    Davies-Bouldin scores are computed with that column as the cluster labels.

    :param df_raw: raw data dataframe
    :param df_cluster: dataframe with cluster labels, one column per model
    :return: data frame with the cluster metrics scores - one row per column of
             ``df_cluster`` and the columns 'silhoutte', 'calinski_harabasz',
             'davies_bouldin'
    """
    # Order matters: it defines the order of the output columns.
    scorers = (
        metrics.silhouette_score,
        metrics.calinski_harabasz_score,
        metrics.davies_bouldin_score,
    )
    scores_by_model = {
        label_col: [scorer(df_raw, df_cluster[label_col]) for scorer in scorers]
        for label_col in df_cluster.columns
    }
    # Keys become columns first; transposing puts one model per row.
    cluster_metrics = pd.DataFrame.from_dict(scores_by_model).T
    cluster_metrics.columns = ['silhoutte', 'calinski_harabasz', 'davies_bouldin']
    return cluster_metrics