ccm_coda.py
"""
Source code for kernel variable selection of compositional data via amalgamation.
Example usage is provided at the bottom of this file.
Note: Some code is borrowed from https://github.com/Jianbo-Lab/CCM
The name "ccm" comes from "conditional covariance minimization."
"""
import numpy as np
import pandas as pd
import kernel
from scipy.spatial.distance import pdist
import tensorflow as tf
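# NOTE (assumption): this module targets TensorFlow 2.x (eager execution + tf.function)
# and relies on the local `kernel` module to provide get_kernel(name, sigma), returning a
# kernel object that is callable on data and exposes a random_features() method
# (used by the RFF branch in CCM.__call__ below).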
def center(X):
"""Returns the centered version of the given square matrix, namely
(I - (1/n) 1 1^T) X (I - (1/n) 1 1^T)
= X - (1/n) 1 1^T X - (1/n) X 1 1^T + (1/n^2) 1 1^T X 1 1^T.
Args:
X: An (n, n) matrix.
Returns:
The row- and column-centered version of X.
"""
mean_col = tf.reduce_mean(X, axis=0, keepdims=True)
mean_row = tf.reduce_mean(X, axis=1, keepdims=True)
mean_all = tf.reduce_mean(X)
return X - mean_col - mean_row + mean_all
def project(v, z):
"""Returns the Euclidean projection of the given vector onto the positive
simplex, i.e. the set
    {w : sum_i w_i = z, w_i >= 0}.
    Implements the algorithm given in Figure 1 of Duchi et al. (2008) [1].
[1] http://stanford.edu/~jduchi/projects/DuchiShSiCh08.pdf
Args:
        v: A vector, already clipped to [0, 1] (via the clip_and_project function).
z: The desired sum of the components. Must be strictly positive.
Returns:
The Euclidean projection of v onto the positive simplex of size z.
"""
assert z > 0
z = tf.convert_to_tensor(z, dtype=tf.float32)
mu = tf.sort(v, direction='DESCENDING')
mu_cumsum = tf.cumsum(mu)
max_index = tf.where(tf.greater(mu * np.arange(1, len(v) + 1), mu_cumsum - z))[-1][0]
temp = tf.cast(max_index + 1, dtype=tf.float32)
theta = (mu_cumsum[max_index] - z) / temp
return tf.maximum(v - theta, 0)
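# A quick sanity check (illustrative example, not part of the original code): with
# v = [0.9, 0.8, 0.1] and z = 1, the threshold is theta = 0.35, so
#   project(tf.constant([0.9, 0.8, 0.1]), 1.0)  # ~ [0.55, 0.45, 0.0], nonnegative and sums to 1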
@tf.function
def clip_and_project(w, num_features):
"""
    Clip w to [0, 1], then project it onto the l1-ball {w : sum(w) <= num_features}.
    This keeps every entry of w inside the interval [0, 1].
"""
w = tf.clip_by_value(w, 0, 1)
# If sum(w) > num_features, project to the l1-ball
if tf.greater(tf.reduce_sum(w), num_features):
w = project(w, num_features)
return w
class CCM(object):
"""
Initialization and evaluation of the objective loss function (after continuous relaxation)
"""
def __init__(
self, X, Y, num_features, transform_Y,
ker="gaussian", init=None):
"""
        :param X: Input data (np.ndarray or pd.DataFrame)
        :param Y: Labels of the data (np.ndarray)
        :param num_features: Number of features to select (the size of the l1-ball constraint on w)
        :param transform_Y: One of None, "one-hot", or "binary"
        :param ker: Choice of kernel (gaussian, laplacian, vonMises, ...)
        :param init: Custom initialization of the gradient descent (use this if you want to apply other initializations)
"""
assert isinstance(X, np.ndarray) or isinstance(X, pd.DataFrame)
assert (np.abs(X.sum(axis=1) - 1) < 1e-8).all(), 'X is not compositional'
assert isinstance(Y, np.ndarray)
assert X.ndim == 2
assert Y.ndim == 1
X = np.array(X)
n, d = X.shape
assert Y.shape == (n,)
self.d = d
self.n = n
# X as a class element
self.X = X
# Deriving kernel parameters
self.sigma = tf.Variable(np.median(pdist(X)) / np.sqrt(2), dtype=tf.float32, trainable=False)
self.kernel_X = kernel.get_kernel(ker, self.sigma)
# Preprocess for Y
assert transform_Y in (None, "binary", "one-hot")
if transform_Y == "binary":
values = sorted(set(Y.ravel()))
assert len(values) == 2
Y_new = np.zeros(n)
# Y is going to be centered
Y_new[Y == values[0]] = -1
Y_new[Y == values[1]] = 1
Y = Y_new
elif transform_Y == "one-hot":
values = sorted(set(Y.ravel()))
Y_new = np.zeros((n, len(values)))
for i, value in enumerate(values):
Y_new[Y == value, i] = 1
Y = Y_new
if Y.ndim == 1:
Y = Y[:, np.newaxis]
# Column-wise centering.
self.Y = tf.convert_to_tensor(Y - Y.mean(0), dtype=tf.float32)
        # Weight variable optimized by projected gradient descent
self.w = tf.Variable(project(tf.ones(d), num_features), dtype=tf.float32,
constraint=lambda x: clip_and_project(x, num_features))
if isinstance(init, np.ndarray):
assert init.size == d
self.w.assign(init)
elif init == "zero":
self.w.assign(np.zeros(d))
# Empirical estimate of Tr[Cond_cov], i.e., the loss of the problem
def __call__(self, epsilon, D_approx=None):
# Amalgamation of compositional data X_w
amalgam = tf.concat([self.X * self.w,
(1 - tf.math.reduce_sum(self.X * self.w, axis=1))[:, tf.newaxis]], axis=1)
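        # The appended last column holds the amalgamated remainder 1 - sum_j w_j * x_ij,
        # so each row of `amalgam` still sums to 1.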
# No RFF approximation
if D_approx is None:
G_X_w = center(self.kernel_X(amalgam))
G_X_w += self.n * epsilon * tf.eye(self.n)
G_X_w_inv = tf.linalg.inv(G_X_w)
# RFF approximation
        # Note: some dtype mismatches remain in this branch and are still to be fixed.
else:
U_w = self.kernel_X.random_features(amalgam, D_approx)
V_w = tf.linalg.matmul(
tf.subtract(
tf.eye(self.n, dtype=tf.float32),
tf.divide(
tf.ones((self.n, self.n), dtype=tf.float32),
tf.constant(float(self.n), dtype=tf.float32))),
U_w)
# eq. 21, arXiv:1707.01164, omitting constant term
G_X_w_inv = tf.linalg.matmul(
tf.scalar_mul(
tf.constant(-1.0, dtype=tf.float32),
V_w),
tf.linalg.matmul(
tf.linalg.inv(
tf.add(
tf.linalg.matmul(
V_w,
V_w,
transpose_a=True),
tf.multiply(
tf.constant(self.n * epsilon, dtype=tf.float32),
tf.eye(D_approx, dtype=tf.float32)))),
V_w,
transpose_b=True))
return tf.linalg.trace(tf.linalg.matmul(self.Y, tf.linalg.matmul(G_X_w_inv, self.Y), transpose_a=True))
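# One projected-gradient step: the tape differentiates the relaxed objective with respect
# to w only; the projection itself is enforced by the `constraint` attached to model.w
# (clip_and_project), which tf.keras optimizers apply after each gradient update.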
def train_step(model, optimizer, epsilon=0.1, D_approx=None):
with tf.GradientTape() as tape:
loss = model(epsilon=epsilon, D_approx=D_approx)
gradients = tape.gradient(loss, [model.w])
optimizer.apply_gradients(zip(gradients, [model.w]))
return loss, model.kernel_X.sigma
def ccm(X, Y, num_features, type_Y, epsilon, learning_rate=0.001,
iterations=1000, D_approx=None, verbose=True, init=None, kernel="gaussian",
optimizer=None, continuation=False):
"""
    This function carries out variable selection for compositional data.
    Args:
        X: An (n, d) numpy array or pandas DataFrame.
        Y: An (n,) numpy array.
        num_features: int. Number of selected features.
        type_Y: str. Type of the response variable.
            Possible choices: 'ordinal', 'binary', 'categorical', 'real-valued'.
        epsilon: regularization parameter; n * epsilon is added to the diagonal of the
            (centered) kernel matrix.
        learning_rate: learning rate of the projected gradient method. Used if optimizer=None.
        iterations: number of iterations of the optimization.
        D_approx: optional rank-D kernel approximation via random Fourier features (RFF).
        verbose: print the loss every 100 updates.
        init: initialization of the gradient descent. By default, it is the center of the simplex.
            Possible choices: None, "zero", or a custom np.ndarray.
        kernel: choice of kernel (gaussian, laplacian, vonMises, ...).
        optimizer: plain gradient descent by default.
            Possible choices: tf.keras.optimizers objects such as Adam or AMSGrad for acceleration.
        continuation: continuation method to avoid local minima (not recommended).
    Returns:
        ranks: A (d,) numpy array of feature ranks (1 = most relevant).
        w: the resulting feature weights.
"""
assert type_Y in ('ordinal', 'binary', 'categorical', 'real-valued')
if type_Y == 'ordinal' or type_Y == 'real-valued':
transform_Y = None
elif type_Y == 'binary':
transform_Y = 'binary'
elif type_Y == 'categorical':
transform_Y = 'one-hot'
fs = CCM(X, Y, num_features, transform_Y, ker=kernel, init=init)
if continuation:
# fs.sigma.assign(1.2 * fs.sigma)
sigma_evolution = fs.sigma * 1. / (tf.sqrt(2.0) * iterations)
if optimizer is None:
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    # Create a fresh tf.function wrapper per call so training works for several model objects
train = tf.function(train_step)
# Training
for i in range(1, iterations + 1):
loss, sigma = train(fs, epsilon=epsilon, optimizer=optimizer, D_approx=D_approx)
if verbose and i % 100 == 0:
print("\niteration", i, "loss", loss.numpy(), end='')
print("\nweights:")
print(fs.w.numpy())
if continuation:
print(" // sigma:", sigma.numpy(), end='')
# Continuation
if continuation:
fs.sigma.assign_sub(sigma_evolution)
        # If you want to perform early stopping, add that code here
print("\nTrain finished")
# Results
w = fs.w.numpy()
# if verbose:
# print('The weights on features are: ', w)
    # Randomly permute the features before ranking so that ties are broken at random
    idx = np.random.permutation(fs.d)
    permuted_weights = w[idx]
    permuted_ranks = (-permuted_weights).argsort().argsort() + 1
    ranks = permuted_ranks[np.argsort(idx)]
return ranks, w
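# A hedged usage sketch (not executed here): for larger sample sizes the kernel matrix can
# be approximated with D random Fourier features through the D_approx argument, e.g.
#   ranks, w = ccm(X, Y, num_features=2, type_Y="binary", epsilon=0.001, D_approx=100)
# The value 100 is only an illustration; see the dtype caveat noted in CCM.__call__.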
if __name__ == "__main__":
"""
Example usage: selection of differentially abundant variables
See ./LassoSelbal/toy.R for results of log-ratio methods
"""
a1, a2 = np.ones(10), np.ones(10)
# Signal at indices 6, 7
a2[6:8] += 2
# Generate compositional data with differential abundance
np.random.seed(77)
X1, Y1 = np.random.dirichlet(a1, 25), np.zeros(25)
X2, Y2 = np.random.dirichlet(a2, 25), np.ones(25)
X = np.r_[X1, X2]
Y = np.r_[Y1, Y2]
print(X)
print(Y)
# Perform variable selection
    # The learning rate is chosen so that the convergence of the loss is clearly visible.
rank, w = ccm(X, Y, num_features=2, type_Y="binary", epsilon=0.001,
learning_rate=3e-5, iterations=2000)
print("Selected variables:", np.sort(np.argsort(rank)[:2])) # selects true variables well
print("Weights:", w)