-
Notifications
You must be signed in to change notification settings - Fork 2
/
Features.py
246 lines (195 loc) · 10.1 KB
/
Features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#! /usr/bin/env python
#author: bliebeskind (primary)
#from protein_complex_maps/protein_complex_maps/features/ExtractFeatures/
import numpy as np
import pandas as pd
import itertools as it
from functions import features, resampling
class Elut():
    '''
    Class to read and inspect a fractionation mass-spec experiment.

    Input must be wide format with the first column holding protein or
    orthogroup ids and every remaining column holding float spectral counts
    (one column per fraction).
    '''

    def __init__(self, data=None):
        '''Optionally initialize from an already-loaded wide-format DataFrame.

        Parameters:
            `data`: <pd.DataFrame or None> wide-format, all-float DataFrame
        '''
        ### Could just extend the DataFrame object by inheriting it, doesn't feel right, though ###
        # self._df = pd.DataFrame.__init__(self,data)
        self.df = None
        self.info = {}  # summary statistics, populated by _get_info()
        if data is not None:
            self.df = data
            self._check_dtypes()
            self._get_info()
        self.is_normalized = False
        self.is_thresholded = False

    def _check_dtypes(self):
        '''Raise if the loaded DataFrame holds anything other than floats.'''
        dtype_set = set(self.df.dtypes)
        # compare dtypes with ==, not `is`: numpy dtype objects are not
        # guaranteed to be singletons, so identity checks can fail spuriously
        if len(dtype_set) != 1 or dtype_set.pop() != np.dtype("float"):
            raise Exception("Multiple datatypes or non-float datatypes found in input DataFrame")

    def load(self, infile, format='tsv'):
        '''Read in data as a pandas DataFrame in wide format.

        Parameters:
            `infile`: path or buffer readable by pandas
            `format`: <str> either 'tsv' (default) or 'csv'
        Raises Exception if data were already loaded or `format` is unknown.
        '''
        if self.df is not None:
            raise Exception("data already loaded")
        if format == 'csv':
            self.df = pd.read_csv(infile, index_col=0).astype("float")
        elif format == 'tsv':
            self.df = pd.read_table(infile, index_col=0).astype("float")
        else:
            raise Exception("<format> must be either 'csv' or 'tsv'")
        if 'TotalCount' in self.df.columns:
            # old elution format carried a precomputed total column; drop it
            # so the all-float dtype check below still passes on the data
            print("WARNING: 'TotalCount' column was found, file is in old elution format, dropping 'TotalCount' column")
            self.df = self.df.drop(['TotalCount'], axis=1)
        self._check_dtypes()
        self._get_info()

    def load_many(self, file_list):
        '''Load a list of files and join together (by row) into a combined dataframe'''
        ## TBD
        pass

    def normalize(self, by=['column']):
        '''Normalize the loaded dataframe by row or by column sums.

        Parameters:
            `by`: <list of str> any of 'column', 'row_sum', 'row_max';
                  several may be given and are applied in that fixed order
        Raises Exception if none of the recognized keywords is present.
        '''
        assert not self.is_normalized, "DataFrame already normalized"
        self._prenormed_df = self.df  # save df in hidden var before normalizing
        did_something = False
        if 'column' in by:
            self.df = self.df / self.df.sum()
            did_something = True
        if 'row_sum' in by:
            self.df = self.df.div(self.df.sum(axis=1), axis=0)
            did_something = True
        if 'row_max' in by:
            self.df = self.df.div(self.df.max(axis=1), axis=0)
            did_something = True
        if not did_something:
            raise Exception("Must specify 'row_max,' 'row_sum', or 'column'\n{}".format(by))
        self.is_normalized = True

    def threshold(self, thresh=5):
        '''Drop rows with total PSMs < thresh.

        Uses the row sums computed by _get_info() at load time.
        '''
        assert not self.is_thresholded, "DataFrame already thresholded"
        len_before = len(self.df)
        # filter on the stored row sums directly instead of inserting and
        # then dropping a temporary "sum" column
        self.df = self.df[self.row_sums >= thresh]
        # the original used a Python 2 print statement here, which is a
        # syntax error under Python 3
        print("Dropped {} rows".format(len_before - len(self.df)))
        self.is_thresholded = True

    def make_tidy(self, just_return=False):
        '''Use pandas melt to reshape dataframe into tidy format, with columns "ID", "FractionID", "Total_SpecCounts".
        If <just_return> is set, it will return the tidy dataframe without changing the stored data '''
        # NOTE(review): melt(id_vars="ID") requires an "ID" column, but load()
        # puts the ids into the index -- confirm callers reset_index() first
        tidy = self.df.melt(id_vars="ID", value_vars=self.df.columns[1:], var_name="FractionID", value_name="Total_SpecCounts")
        if just_return:
            return tidy
        self._wide_df = self.df  # save df in hidden var before melting
        self.df = tidy

    def undo_tidy(self):
        '''Resets data to wide format'''
        self.df = self._wide_df
        self._wide_df = None

    def _get_info(self):
        '''Compute simple summary info on the stored elution profiles.'''
        if self.df is None:
            # py3 print() call (original used a py2 print statement)
            print("No loaded data")
            return
        self.row_sums = self.df.sum(axis=1)  # per-protein totals; reused by threshold()
        self.info["n_PSMs"] = self.row_sums.sum()
        self.info["PSMs_per_row_stats"] = self.row_sums.describe()[1:].to_dict()
        self.info["n_proteins"] = len(self.df)
        self.info["n_fractions"] = len(self.df.columns)
class ElutFeatures(Elut, features.FeatureFunctions, resampling.FeatureResampling):
    '''Class to extract a variety of quantitative features on each pair of
    proteins/orthogroups in an elution experiment'''

    ### Notes:
    ### - To add features, put the name in this list, and then add a function to features.py
    ###   with the same name but prepended with a single underscore.
    available_features = ["pearsonR",
                          "spearmanR",
                          "spearmanR_weighted",
                          "jensen_shannon",
                          "covariance",
                          "euclidean",
                          "canberra",
                          "braycurtis",
                          "invbraycurtis",
                          "cosine",
                          "sum_difference"]

    resampling_strategies = ["poisson_noise",
                             "bootstrap"]

    def __init__(self, data=None):
        '''Initialize the underlying Elut container plus analysis bookkeeping.'''
        Elut.__init__(self, data)
        self.analysis_count = 0  # number of completed extract_features() calls
        self.analyses = []       # one descriptive tag per completed analysis

    def _to_df(self, df, feature_matrix, feature_string):
        '''Turn the square pairwise feature matrix into a tidy DataFrame.

        Returns columns ["ID1", "ID2", <feature_string>], keeping each
        unordered pair once (strict upper triangle; self-pairs excluded).
        '''
        ## Return off diagonal by default - make this a flag? ##
        square_df = pd.DataFrame(feature_matrix, columns=df.index.values, index=df.index)
        square_df.sort_index(inplace=True)
        square_df.sort_index(axis=1, inplace=True)
        # mask everything except the strict upper triangle (k=1)
        mask = np.triu(np.ones(square_df.shape), k=1).astype('bool')
        off_diag = square_df.where(mask)
        tidy_df = off_diag.stack().reset_index()
        tidy_df.columns = ["ID1", "ID2", feature_string]
        return tidy_df

    def _average_resamples(self, df, feature_function, resample_function, iterations):
        '''Return the feature matrix averaged over <iterations> resamples of <df>.'''
        ## work on 1d pdist output
        return (sum(feature_function(resample_function(df, rep=i)) for i in range(iterations)) / iterations)

    def extract_features(self, feature, resampling=False, iterations=False, threshold=False, normalize=False):
        '''
        Returns a DataFrame of features
        Parameters:
        `feature`: <str> Feature to extract. Must be one of the features available in self.available_features
        `resampling`: <str> Whether to resample and average. Must be either poisson_noise or bootstrap
        `iterations`: <int> Used with resampling. Number of iterations to resample for
        `threshold`: <int or float> Remove rows with fewer than `threshold` spectral matches
        `normalize`: <list of str> Whether to normalize the data. Can be one or several of: row_sum, row_max, column
        '''
        # Assign the function to extract features
        feat = "_" + feature  # because I'm hiding actual feature functions
        assert hasattr(self, feat), "{} not in available features:\n{}".format(feature, ElutFeatures.available_features)
        feat_func = getattr(self, feat)  # lookup the relevant function
        self._analysis = {"feature": feature}  # initialize or reset dictionary to hold information on which analysis was done
        # Assign and execute normalization functions
        ## Currently in a hacky state ##
        if threshold:
            self.threshold(threshold)
            if len(self.df) == 0:
                # original raised from an except block with an unreachable
                # `return None` after the raise; the raise is the behavior kept
                raise Warning("No rows left after thresholding")
            self._analysis["threshold"] = "thresh" + str(threshold)
        if normalize:
            self.normalize(normalize)
            self._analysis["normalize"] = 'norm' + "".join(normalize)
        if feature in ["jensen_shannon", "kullback_leibler"]:
            print("Adding pseudocounts and re-normalizing for JS or KL divergence")
            self.df = self.df + 1
            # bugfix: divide by row max directly instead of calling
            # self.normalize(by='row_max'), which asserts "already normalized"
            # whenever a `normalize` argument was also supplied above
            self.df = self.df.div(self.df.max(axis=1), axis=0)
        ## Don't like this part, because it could add pseudocounts twice if jensen-shannon + poisson
        if resampling == "poisson_noise":  # this is how Traver's function does it, for some reason
            # 1.0 forces float division; the original `1/len(self.df)`
            # truncated to 0 under Python 2 integer division
            self.df = self.df + (1.0 / len(self.df))
        # Assign resampling function and return averaged DataFrame
        if resampling:
            assert iterations > 1, "if resampling, must specify more than 1 iteration"
            respl = "_" + resampling
            # bugfix: qualify resampling_strategies with the class (bare name
            # was a NameError) and report the offending `resampling` value
            # (the original formatted `feature` here)
            assert hasattr(self, respl), "{} not in available resampling strategies:\n{}".format(resampling, ElutFeatures.resampling_strategies)
            respl_func = getattr(self, respl)
            feature_matrix = self._average_resamples(self.df, feat_func, respl_func, iterations)
            self._analysis["resampling"] = resampling
            self._analysis["reps"] = str(iterations) + "reps"
        else:  # If no resampling execute feature function and return DataFrame
            feature_matrix = feat_func(self.df)
        # Store information on what analysis was done
        a_list = [self._analysis[a]
                  for a in ["feature", "resampling", "reps", "threshold", "normalize"]
                  if a in self._analysis]
        self.analyses.append("_".join(a_list))
        self.analysis_count += 1
        return self._to_df(self.df, feature_matrix, feature)