# model_III.py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from scipy import stats
import scipy.io
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from warnings import filterwarnings
filterwarnings('ignore')
print(sklearn.__version__)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
#Functions
# Compute p-value using the chi-squared test for binary predictors
def chi2_cols(y, x):
    '''
    input:
        y: 1-d binary label array
        x: 1-d binary feature array
    return:
        p-value of the chi-squared test
    '''
    y_list = y.astype(int).tolist()
    x_list = x.astype(int).tolist()
    # observed 2x2 contingency table of label vs. feature
    freq = np.zeros([2, 2])
    for i in range(len(y_list)):
        if y_list[i] == 0 and x_list[i] == 0:
            freq[0, 0] += 1
        if y_list[i] == 1 and x_list[i] == 0:
            freq[1, 0] += 1
        if y_list[i] == 0 and x_list[i] == 1:
            freq[0, 1] += 1
        if y_list[i] == 1 and x_list[i] == 1:
            freq[1, 1] += 1
    # expected frequencies under independence of x and y
    y_0_sum = np.sum(freq[0, :])
    y_1_sum = np.sum(freq[1, :])
    x_0_sum = np.sum(freq[:, 0])
    x_1_sum = np.sum(freq[:, 1])
    total = y_0_sum + y_1_sum
    y_0_ratio = y_0_sum / total
    freq_ = np.zeros([2, 2])
    freq_[0, 0] = x_0_sum * y_0_ratio
    freq_[0, 1] = x_1_sum * y_0_ratio
    freq_[1, 0] = x_0_sum - freq_[0, 0]
    freq_[1, 1] = x_1_sum - freq_[0, 1]
    stat, p_value = stats.chisquare(freq, freq_, axis=None)
    return p_value  # , stat
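# Optional cross-check (not part of the original pipeline): scipy can build the same
# 2x2 table and test it directly. The helper name chi2_cols_scipy below is ours; its
# p-values can differ from chi2_cols because chi2_contingency evaluates a 2x2 table
# with 1 degree of freedom, whereas the cell-wise stats.chisquare call above uses 3.
def chi2_cols_scipy(y, x):
    table = pd.crosstab(pd.Series(y).astype(int), pd.Series(x).astype(int))
    _, p_value, _, _ = stats.chi2_contingency(table, correction=False)
    return p_value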
# Compute the variables statistics such as mean, std, p-value, and correlation
def stat_test(df, y):
    '''
    input:
        df: 2-d dataframe of the dataset
        y: 1-d binary label array
    return:
        a dataframe of the variables' statistics such as mean, std, p-value, and correlation
    '''
    name = pd.DataFrame(df.columns, columns=['Variable'])
    df0 = df[y == 0]
    df1 = df[y == 1]
    pvalue = []
    y_corr = []
    for col in df.columns:
        if df[col].nunique() == 2:
            # Compute p-value using the chi-squared test for binary predictors
            pvalue.append(chi2_cols(y, df[col]))
        else:
            # Compute p-value using the Kolmogorov-Smirnov test for continuous predictors
            pvalue.append(stats.ks_2samp(df0[col], df1[col]).pvalue)
        # Compute pairwise correlation of the variable and the label
        y_corr.append(df[col].corr(y))
    name['All_mean'] = df.mean().values
    name['y1_mean'] = df1.mean().values
    name['y0_mean'] = df0.mean().values
    name['All_std'] = df.std().values
    name['y1_std'] = df1.std().values
    name['y0_std'] = df0.std().values
    name['p-value'] = pvalue
    name['y_corr'] = y_corr
    return name.sort_values(by=['p-value'])
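# Illustrative usage (stat_test is defined here but not called elsewhere in this script).
# With the outcome column 'pregnant' used later on, a call could look like:
#   summary_stats = stat_test(df_new.drop(columns=['pregnant']), df_new['pregnant'])
# The label is dropped from the predictor frame so it is not tested against itself;
# 'summary_stats' is a hypothetical variable name.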
# Compute pairwise correlation of each continuous variable and the label
# and drop the variable with low correlation (<0.04)
def df_ycorr(df, col_y):
    '''
    input:
        df: 2-d dataframe of the dataset
        col_y: label name
    return:
        a list of variable names that we want to drop because of low correlation with the outcome
    '''
    drop_cols = []
    for col in df.columns:
        if df[col].nunique() != 2:
            y_corr = round(df[col_y].corr(df[col]), 2)
            if abs(y_corr) < 0.04:
                drop_cols.append(col)
    return drop_cols
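# Illustrative usage (df_ycorr is defined here but not called below; the variable name
# low_corr_cols is hypothetical):
#   low_corr_cols = df_ycorr(df_new, 'pregnant')
#   df_new = df_drop(df_new, low_corr_cols)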
# Compute pairwise correlation of variables with each other
# and if the correlation is high (>0.8), we keep one variable of the highly-correlated variables
def high_corr(df, thres=0.8):
    '''
    input:
        df: 2-d dataframe of the dataset
        thres: threshold we use to determine highly correlated variables
    return:
        a list of (variable_1, variable_2, correlation) tuples, one per highly correlated pair
    '''
    corr_matrix_raw = df.corr()
    corr_matrix = corr_matrix_raw.abs()
    high_corr_var_ = np.where(corr_matrix > thres)
    high_corr_var = [(corr_matrix.index[x], corr_matrix.columns[y], corr_matrix_raw.iloc[x, y])
                     for x, y in zip(*high_corr_var_) if x != y and x < y]
    return high_corr_var
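# For example, with the default thres=0.8 the returned list has entries of the form
#   ('var_a', 'var_b', 0.85)
# (the names and value here are illustrative only, not taken from the PRESTO data).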
# a function to drop the variables we want to drop
def df_drop(df_new, drop_cols):
    '''
    input:
        df_new: 2-d dataframe of the dataset after preprocessing, including one-hot encoding and statistical feature selection
        drop_cols: a list of variables we want to drop
    return:
        our dataframe after dropping the variables we want to drop
    '''
    return df_new.drop(df_new.columns[df_new.columns.isin(drop_cols)], axis=1)
# step1: load dataset (dataset after pre-processing and one-hot encoding)
df_new=pd.read_csv('df_PRESTO_id.csv')
## apply the eligibility restrictions we want
### keep only participants with no more than one menstrual cycle of pregnancy attempt at study entry
df_new = df_new.loc[df_new['ttp_entry'] < 2, :].reset_index(drop=True)
### define the time variable for the survival model
df_new['time_variable'] = (df_new['TTP_exit'] - df_new['ttp_entry']).astype(int)
### Model III predicts the probability of pregnancy within each menstrual cycle for up to 12 cycles of follow-up (fecundability model)
which_month = 12
### cap count variables: birth order and live births at 3, total C-sections at 2
df_new['b_livebirths'] = round(df_new['b_livebirths'])
for col in ['b_birthorder', 'b_livebirths']:
    df_new.loc[df_new[col] > 3, col] = 3
for col in ['b_csectiontotal']:
    df_new.loc[df_new[col] > 2, col] = 2
### drop columns we don't need
drop_cols = ['b_finisheddate_year', 'persons', 'TTP_exit', 'newid']
df_new = df_drop(df_new, drop_cols)
## load variable descriptions to be added to the final predictors' coefficient tables
explain_Variables=pd.read_csv('variable_list_20200910.csv')
# step2: Statistical Feature Selection
## drop variables with low std (<0.0001)
col_y='pregnant'
df_std = df_new.std()
drop_cols = df_std[df_std<0.0001].index.values
df_new=df_drop(df_new, drop_cols)
## keep only one variable from each pair of highly correlated variables (threshold=thres)
high_corr_features = high_corr(df_new, thres=0.8)
# select one variable to drop from each pair in high_corr_features
# (for model_II the choice was: drop_cols=['b_everpregnant', 'b_livebirths', 'hxinfert'])
drop_cols = ['b_everpregnant', 'hxinfert', 'b_geog5_6.0', 'b_livebirths']
df_new=df_drop(df_new, drop_cols)
# step3: lifelines CoxPHFitter
from lifelines import CoxPHFitter
# scaling/normalizing
col_y = 'pregnant'
scaler = preprocessing.StandardScaler()  # alternative: preprocessing.MinMaxScaler()
X = df_new.drop([col_y, 'time_variable', 'ttp_entry'], axis=1)
name_cols = X.columns.values
X = scaler.fit_transform(X)
df = pd.DataFrame(X, columns=name_cols)
df[col_y] = df_new[col_y]
df['time_variable'] = df_new['time_variable']
df['ttp_entry'] = df_new['ttp_entry']
## drop variables with a high p-value (>0.05)
cph = CoxPHFitter()  # optionally regularized: CoxPHFitter(penalizer=0.01, l1_ratio=1.0)
cph.fit(df=df, duration_col='time_variable', event_col='pregnant', strata='ttp_entry')  # or: duration_col='TTP_exit', entry_col='ttp_entry'
df_coef_cox = cph.summary
df_coef_cox['Variable'] = df_coef_cox.index
df_coef_cox['coef_abs'] = df_coef_cox['coef'].abs()
LRresult_ = pd.merge(df_coef_cox.sort_values(by=['coef_abs'], ascending=False), explain_Variables[['Variable', 'Label']], how='left', on=['Variable'])
LRresult_ = LRresult_.drop(['coef_abs'], axis=1)
vars_small_pvalue = LRresult_[LRresult_['p'] < 0.05]['Variable'].tolist()
my_df = df[vars_small_pvalue + ['time_variable', 'pregnant', 'ttp_entry']].copy()
# compute the concordance index
# concordance_index = []
my_seeds = range(2016, 2021)  # random_state values that control the shuffling applied to the data before the split
for seed in my_seeds:
    # split the dataset into a train set and a test set
    cph_train, cph_test = train_test_split(my_df, test_size=0.2, random_state=seed)
    # train the model on the train set
    cph.fit(df=cph_train, duration_col='time_variable', event_col='pregnant', strata='ttp_entry')
    # evaluate the Cox model on the test set
    # concordance_index.append(cph.score(cph_test, scoring_method='concordance_index'))
    concordance_index = cph.score(cph_test, scoring_method='concordance_index')
    # print("concordance_index is: " + str(sum(concordance_index) / len(concordance_index)))
    print("concordance_index on test is: " + str(concordance_index))
# Feature Selection using the train set
X = cph_train.copy()
y = X[["pregnant", "time_variable", "ttp_entry"]]
X.drop(["pregnant", "time_variable", "ttp_entry"], axis=1, inplace=True)
n_features = X.shape[1]
scores = np.empty(n_features)
m = CoxPHFitter()
for j in range(n_features):
    # fit a univariate Cox model on the j-th predictor and score it by its concordance index
    Xj = X.iloc[:, j:j+1]
    Xj = pd.merge(Xj, y, how='right', left_index=True, right_index=True)
    m.fit(Xj, duration_col="time_variable", event_col="pregnant", strata='ttp_entry')  # , show_progress=True
    scores[j] = m.score(Xj, scoring_method='concordance_index')
my_df_predictors = pd.Series(scores, index=X.columns).sort_values(ascending=False)
k=15
top15_predictors = my_df_predictors[0:k].index.tolist()
# Repeat with top15 predictors
my_top15_train = cph_train[top15_predictors + ['time_variable','pregnant','ttp_entry']].copy()
cph = CoxPHFitter()  # optionally regularized: CoxPHFitter(penalizer=0.01, l1_ratio=1.0)
cph.fit(df=my_top15_train, duration_col='time_variable', event_col='pregnant', strata='ttp_entry')  # or: duration_col='TTP_exit', entry_col='ttp_entry'
df_coef_cox = cph.summary
df_coef_cox['Variable'] = df_coef_cox.index
df_coef_cox['coef_abs'] = df_coef_cox['coef'].abs()
LRresult_ = pd.merge(df_coef_cox.sort_values(by=['coef_abs'],ascending=False), explain_Variables[['Variable','Label']], how='left', on=['Variable'])
LRresult_ = LRresult_.drop(['coef_abs'], axis=1)
# Evaluating top15 cox model on test
top15_columns = my_top15_train.columns.tolist()
test_concordance_index = cph.score(cph_test[top15_columns], scoring_method='concordance_index')
print(test_concordance_index)
# model_III top variables
print(LRresult_)