-
Notifications
You must be signed in to change notification settings - Fork 0
/
CompsML.py
94 lines (75 loc) · 3.08 KB
/
CompsML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import sys
sys.path.append('..')
from lib import *
import pandas as pd
import numpy as np
'''
Input: lsa_output_datapath = path to the .npy file holding the saved LSA matrix, email_clean = dataframe containing cleaned email data
Output: dataframe containing LSA values and email data
Combine LSA matrix and email data
'''
def setup_dataframe(lsa_output_datapath, email_clean, user_input=False):
    """Combine a saved LSA matrix with the cleaned email data.

    Args:
        lsa_output_datapath: Path of the .npy file holding the LSA matrix
            (loaded with ``np.load``). The matrix needs at least 100
            columns, because columns 0..99 are selected below.
        email_clean: DataFrame of cleaned email data. It must provide the
            'To', 'From', 'Date', 'Label' and 'ID' columns, plus
            'Relevant' when ``user_input`` is true.
        user_input: When true, the 'Relevant' column is kept as well.

    Returns:
        DataFrame whose columns are the LSA features 0..99 followed by
        'To', 'From', 'Date', 'Label', 'ID' (and 'Relevant' when
        ``user_input`` is true), with one row per row of ``email_clean``.
    """
    lsa_df = pd.DataFrame(np.load(lsa_output_datapath))
    # Positional index so the emails line up row-by-row with the LSA matrix.
    metadata = email_clean.reset_index(drop=True)

    # `join_axes` was removed from pd.concat in pandas 1.0; reindexing the
    # concatenated frame on the metadata index is the documented equivalent
    # (keep exactly the metadata rows, NaN-fill LSA values that are missing).
    full_df = pd.concat([metadata, lsa_df], axis=1).reindex(metadata.index)

    columns = list(range(100)) + ['To', 'From', 'Date', 'Label', 'ID']
    if user_input:
        columns.append('Relevant')
    return full_df[columns]
'''
Input: full_dataframe = output from setup_dataframe()
Output: None
Use full dataframe to train tree and save it.
'''
def train_tree(full_dataframe, user_input=False,
               output_path='scenario_full_train.pickle'):
    """Fit a forest on the full dataframe and store it on disk.

    Args:
        full_dataframe: Output of ``setup_dataframe()``.
        user_input: Forwarded unchanged to the ``RNF`` constructor.
        output_path: File the fitted forest is stored to via
            ``RNF.store_rnf``. Defaults to the previously hard-coded
            'scenario_full_train.pickle'.

    Returns:
        None.
    """
    n_trees = 32
    tree_depth = 70
    random_seed = None  # NOTE(review): presumably means "unseeded" -- confirm in RNF
    n_max_features = 90
    # The row count of the training frame is passed as n_max_input.
    n_max_input = full_dataframe.shape[0]
    cat_features = ['To', 'From']

    forest = RNF(full_dataframe, n_trees, tree_depth, random_seed,
                 n_max_features, n_max_input, cat_features,
                 user_input=user_input)
    forest.fit_parallel()
    forest.store_rnf(output_path)
#Input: tree_datapath = trained tree, test_dataframe = output from setup_dataframe with test data
#Output: None
#Evaluate tree on test data
def _build_eval_report(stats):
    """Format the four evaluation metrics as percent strings and print them.

    ``stats`` is indexed as [recall, precision, accuracy, f1], matching how
    the result of ``evalStats`` is consumed by the callers below.
    """
    metric_names = ("Recall", "Precision", "Accuracy", "F1")
    report = {
        name: str(stats[position] * 100) + "%"
        for position, name in enumerate(metric_names)
    }
    # F1 used to be printed as a raw fraction without "%"; it is now printed
    # exactly like the other metrics and like the value stored in the dict.
    for name in metric_names:
        print(name + ":" + report[name])
    return report


def test_tree(tree_datapath, test_dataframe, user_input=False):
    """Load a stored forest and evaluate it on test data.

    Args:
        tree_datapath: Path handed to ``RNF.load_rnf``.
        test_dataframe: Output of ``setup_dataframe()`` for the test data.
        user_input: Forwarded to the ``RNF`` constructor and ``evalStats``.

    Returns:
        Dict with the keys "Recall", "Precision", "Accuracy" and "F1",
        each a percentage string such as "87.5%". The same values are
        printed to stdout.
    """
    # Empty shell; load_rnf() is then called to load the stored forest.
    test_forest = RNF(None, None, None, None, None, None, None,
                      user_input=user_input)
    test_forest.load_rnf(tree_datapath)
    predictions = test_forest.predict_parallel(test_dataframe)
    stats = evalStats(predictions[1], test_dataframe, user_input)
    return _build_eval_report(stats)


def test_tree_frontend(test_forest, test_dataframe):
    """Evaluate an already-loaded forest on test data (user-input mode).

    Args:
        test_forest: Forest object exposing ``predict_parallel``.
        test_dataframe: Output of ``setup_dataframe()`` for the test data.

    Returns:
        Dict with the keys "Recall", "Precision", "Accuracy" and "F1",
        each a percentage string. The same values are printed to stdout,
        followed by the line "Gold Standard".
    """
    predictions = test_forest.predict_parallel(test_dataframe)
    stats = evalStats(predictions[1], test_dataframe, user_input=True)
    report = _build_eval_report(stats)
    print("Gold Standard")
    return report