train_test_model.py
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import sklearn.metrics
from sklearn.externals import joblib  # note: deprecated in scikit-learn >= 0.21; on newer versions use "import joblib" directly
### BJL: paste necessary imports from TPOT here
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount
### BJL: training and test data are CSV files whose first two columns are IDs and whose label column is called "label";
### all other columns should be features.
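### Illustrative layout only (these column names are hypothetical, not the actual ones):
###   id_1,id_2,label,feat_1,feat_2,...
###   geneA,geneB,1,0.12,3.4,...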
training_infile = "/project/bjl786/Synaptosome/Mouse/features/Mouse_features_07142018.train_labeled.csv"
test_infile = "/project/bjl786/Synaptosome/Mouse/features/Mouse_features_07142018.test_labeled.csv"
### BJL: outfiles
serialized_trained_model_outfile = "tpot_07142018_fitted_model.p"
pr_curve_outfile = "tpot_07142018_test_PRC.csv"
results_df_outfile = "tpot_07142018_test_resultsDF.csv"
index_cols = [0, 1]
##### BJL: paste TPOT pipeline here
exported_pipeline = make_pipeline(
ZeroCount(),
VarianceThreshold(threshold=0.0005),
ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_features=0.1, min_samples_leaf=3, min_samples_split=5, n_estimators=100)
)
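# Pipeline steps: ZeroCount appends per-sample counts of zero and non-zero feature values,
# VarianceThreshold drops near-constant features (variance < 0.0005), and the
# ExtraTreesClassifier produces the final predictions.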
##### Don't change anything below here
assert os.path.exists(training_infile), "{} not found".format(training_infile)
assert os.path.exists(test_infile), "{} not found".format(test_infile)
print("Reading training data")
train = pd.read_csv(training_infile, index_col=index_cols)
train_label = train.pop("label").values
train_data = train.values
print("Training model")
exported_pipeline.fit(train_data, train_label)
joblib.dump(exported_pipeline, serialized_trained_model_outfile)
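# Column 1 of predict_proba is the predicted probability of the positive class (label == 1)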
train_fit_probs = exported_pipeline.predict_proba(train_data)[:,1]
train_aps = sklearn.metrics.average_precision_score(train_label,train_fit_probs)
print("Training set average precision score: {}".format(train_aps))
del train
del train_data
print("Reading test data")
test = pd.read_csv(test_infile, index_col=index_cols)
test_label = test.pop("label").values
test_data = test.values
test_probs = exported_pipeline.predict_proba(test_data)[:,1]
test_aps = sklearn.metrics.average_precision_score(test_label, test_probs)
print("Test set average precision score: {}".format(test_aps))
test_p, test_r, _ = sklearn.metrics.precision_recall_curve(test_label, test_probs)
test_PRC = pd.DataFrame({"precision": test_p, "recall": test_r}).sort_values("recall")
test_PRC.to_csv(pr_curve_outfile,index=False)
test_DF = pd.DataFrame({"label":test_label,"P_1":test_probs}, index=test.index).sort_values("P_1", ascending=False)
test_DF["FDR"] = 1 - (test_DF.label.cumsum() / (np.arange(test_DF.shape[0]) + 1))
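# FDR at each row = 1 - running precision among the top-k predictions when ranked by
# descending P_1 (assumes labels are coded as 0/1)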
test_DF.to_csv(results_df_outfile)