-
Notifications
You must be signed in to change notification settings - Fork 3
/
Vespa_class.py
238 lines (184 loc) · 8.86 KB
/
Vespa_class.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
"""
ETH Zurich | Gisela Gabernet
Simulated molecular evolution algorithm (SME)
"""
import sys
import numpy as np
import os
def save_fasta(filename, sequences, names=None):
"""Method for saving sequences (and names) into a fasta file.
:param filename: {str} output filename (ending .fasta)
:param sequences: {list} sequences to be saved to file
:param names: {list} whether sequence names from self.names should be saved as sequence identifiers
:return: a FASTA formatted file containing the generated sequences
"""
if os.path.exists(filename):
os.remove(filename) # remove outputfile, it it exists
with open(filename, 'w') as o:
for n, seq in enumerate(sequences):
if names:
o.write('>' + str(names[n]) + '\n')
else:
o.write('>Seq_' + str(n) + '\n')
o.write(seq + '\n')
def _boxmuller():
"""The Box-Muller transformation picks a number from a gaussian distribution centered in 0 and with sigma = 1
given two randomly generated numbers (i, j) in the interval [0, 1].
:return: {float} number picked from normal distribution with expectation 0 and sigma 1.
"""
i = np.random.random_sample(1) # picks random float [0, 1)
j = np.random.random_sample(1) # picks random float [0, 1)
return np.sqrt(-2.0 * np.log10(i)) * np.sin(2.0 * np.pi * j)
def mutate(seq, sigma, sigma_strategy, aa_matrix, aa_order, aa_excluded):
"""Mutates *parent* sequence according to probabilities derived from the provided matrixfile, which have been scaled
row-wise and using the boltzmann function with decay parameter sigma in the following way:
.. math::
P(i -> j) = exp(- dfrac{d_{ij}^2}{2 \sigma^2}) / sum_j exp(- dfrac{d_{ij}^2}{2 \sigma^2})
where :math:`d_{ij}`is the pair-wise amino acid distance in the provided matrix, scaled to a common interval to
obtain pseudo-probabilities for each residue position in a peptide, and :math:\sigma` is the provided sigma.
For more information refer to: J. A. Hiss & G. Schneider. Peptide Design by Nature-Inspired Algorithms, in
De novo Molecular Design. pp 444 - 448. 2014 Wiley-VCH.
:param seq: {str} sequence to mutate.
:param sigma: {float} sigma value for the mutation probability distribution.
:param aa_matrix: {np array} normalized aa similarity matrix.
:param aa_order: {np array} order of the amino acids in the aa similarity matrix.
:param aa_excluded: {list} list of the amino acids that are desired to be excluded from the offspring.
:return:
"""
mat = aa_matrix
aa_order = aa_order
offspring = []
dist_aa = 0
for p in range(len(seq)):
i = int(np.where(seq[p] == aa_order)[0]) # Translating the amino acid to a column index in the grantham matrix.
# Calculate the probability of mutation
probs = np.exp((-(mat[i, :] * mat[i, :]) / (2 * sigma * sigma)) /
np.sum(np.exp(-(mat[i, :] * mat[i, :]) / (2 * sigma * sigma))))
# Making probability of mutation to undesired aa in "no_aa" always 0.
mc0 = np.ones(20)
if len(aa_excluded)==1 and aa_excluded == '0':
pass
elif len(aa_excluded) >= 1:
for aa in aa_excluded:
idx_aa = int(np.where(aa == aa_order)[0])
mc0[idx_aa] = 0
else:
sys.exit("Non-desired aa argument invalid. Please type aa in 1-letter code without separation that "
"you wish to be excluded in the mutations, or type 0 if you don't wish to exclude any aa.")
probs = probs * mc0 / (np.sum(probs * mc0))
# Picking amino acid:
aa_mut = np.random.choice(aa_order, 1, p=probs)
aa_mut_idx = int(np.where(aa_mut == aa_order)[0])
offspring.extend(aa_mut)
dist_aa += (mat[i, aa_mut_idx]) ** 2
# Choosing offspring sigma.
if sigma_strategy == "Gaussian": # gaussian strategy with mean parent sigma and SD parent sigma.
sigma_off = abs(sigma + _boxmuller() * sigma)
elif sigma_strategy == "Thirds": # one-third strategy
sigma_off = np.random.choice([0.7 * sigma, sigma, 1.3 * sigma])
elif sigma_strategy == "Equal": # sigma remains constant
sigma_off = sigma
else:
sys.exit("Unknown Offspring sigma strategy, please choose from:\n"
"- Equal (default): sigma remains constant\n"
"- Gaussian (Gaussian distributed): offspring sigma gaussian distributed with SD seed sigma and "
"centered on seed sigma\n"
"- Thirds (one-third strategy): offspring sigma with probability 1/3 0.7*sigma, "
"with probability 1/3 sigma, with probability 1/3 1.3 * sigma\n")
return ''.join(offspring), np.sqrt(dist_aa), sigma_off
class SME(object):
def __init__(self, parent, n_offspring, sigma, matrix_file,
sigma_strategy="Equal", aa_excluded=None, random_seed=None):
"""Initialization of a new generation for a simulated molecular evolution iteration.
:param parent: {str} peptide sequence of the parent peptide.
:param offspring_number: {int} number of offspring for the generation.
:param n_generations: {int} number of generations.
:param sigma: {float} sigma value for the probability of mutation distance.
:param sigma_strategy: {str: Gaussian/Thirds} sigma strategy for the sigma evolution across generations.
:param fitness_function: {} fitness function to use as offspring selection.
:param matrix_file: {path to matrix file} amino acid similarity matrix, the first row needs to be
the amino acid order of the columns in the matrix.
:param aa_excluded: {list or string} amino acids to exclude in generating the mutations, in one letter code,
as string e.g. "CM" or list of strings e.g. ["C","M"].
"""
self.parent_first = parent
self.parent = parent
self.n_offspring = n_offspring
self.sigma_start = sigma
self.sigma = sigma
self.sigma_strategy = sigma_strategy
self.n_iterations = 0
if aa_excluded:
self.aa_excluded = aa_excluded
else:
self.aa_excluded = '0'
self.offspring = []
self.offspring_seq = []
if random_seed:
self.random_seed = random_seed
np.random.seed(random_seed)
else:
self.random_seed = None
if not os.path.exists(matrix_file):
sys.exit("\n Matrix file does not exist under provided path.")
aa_matrix = np.genfromtxt(matrix_file, delimiter='\t', dtype=float, skip_header=True)
self.aa_matrix = (aa_matrix - np.min(aa_matrix, axis=1)) / \
(np.max(aa_matrix, axis=1) - np.min(aa_matrix, axis=1))
self.aa_order = np.genfromtxt(matrix_file, delimiter='\t', max_rows=1, dtype=str)
def variation(self):
"""Method for producing a new iteration of simulated molecular evolution.
:return: Appended new offspring sequences, calculated distances and offspring sigma
for the iteration in attribute offspring.
"""
self.n_iterations += 1
# Mutating to new offspring, calculating distance and sigma.
off_generation = [(0, self.parent, 0, self.sigma)]
for i in range(1, self.n_offspring + 1):
off, dist, sigma = mutate(self.parent, self.sigma, self.sigma_strategy, self.aa_matrix, self.aa_order,
self.aa_excluded)
off_generation.append((i, off, dist, sigma))
self.offspring_seq.append(off)
self.offspring.append(off_generation)
def update_parent(self, parent):
"""Method to manually select a new parent for the following round of iteration.
:param parent: {str} sequence of the parent peptide.
:return: updated parent sequence in self.parent
"""
self.parent = parent
if self.parent not in self.offspring_seq:
print "\nWARNING: the selected parent sequence is not registered in the offspring\n"
def selection(self, fitness_function):
"""Method for selecting the best offspring according to a fitness function.
--> need to update parent and sigma in self.parent and self.sigma.
:param fitness_function: fitness function.
:return: updated parent and sigma in attibutes self.parent and self.sigma.
"""
def iterations(self, n, fitness_function):
"""Method to produce several SME iterations according to the provided fitness function.
--> need to update parent and sigma in self.parent and self.sigma.
:param n: number of iterations.
:param fitness_function: fitness function.
:return: Appended new offspring sequences, calculated distances and offspring sigma
for the iteration in attribute offspring.
"""
def save_offspring(self, filename):
"""Method to save the offspring sequences in a text file and fasta file.
:return: text file and fasta file with the offspring from all the iterations.
"""
with open(filename + ".txt", mode='w') as f:
f.write("\nVESPA v3\n"
"\nSeed: %s\nStrategy: (1, %i)\nSigma: %.2f\nOffspring sigma strategy: %s"
"\nUndesired aa: %s\n" % (self.parent_first, self.n_offspring, self.sigma_start,
self.sigma_strategy, ''.join(self.aa_excluded)))
if self.random_seed:
f.write("Random Seed: %i\n" % (self.random_seed))
names = []
seqs = []
f.write("\nName\tDist\tSigma\tSequence\n")
for i, iter in enumerate(self.offspring):
for o, off in enumerate(iter):
name = ''.join([str(i + 1), '.', str(o)])
names.append(name)
seqs.append(off[1])
f.write("%s\t%3.3f\t%.2f\t%s\n" % (name, off[2], off[3], off[1]))
save_fasta(filename + '.fasta', seqs, names)