forked from laurentnoe/iedera
-
Notifications
You must be signed in to change notification settings - Fork 0
/
plot_mow_seeds.py
executable file
·194 lines (159 loc) · 7.41 KB
/
plot_mow_seeds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/env python3
import os,sys,re
def fact(n):
res = 1
for i in range(2,n+1):
res *= i
return res
def binomial(n, k):
return fact(n) // (fact(k) * fact(n - k))
from fractions import Fraction
# Model "file_4iupac" is provided below
# - Final state 0 (final = 1) is not used ... transitions for the 16 letters are absent (0) : this is not important, since unused
# - Initial state 1 (final = 0) is self-looping with one transition per letter, and with probabality associated to the transition matrix
# A C G T
# A 3x xp xp xp
# C xp 3x xp xp
# G xp xp 3x xp
# T xp xp xp 3x
#
#
# but since its integer based computation and the sum is 12 (and not 1), this has to be divided by 12**alignment_length after
# the full computation (one day i will put a rational coefficient "c++" template, this would help :oD )
#
file_4iupac = """
2 0 1 0 0
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
1 0 0 1 1 3 * x
1 1 1 xp
2 1 1 xp
3 1 1 xp
4 1 1 xp
5 1 1 3 * x
6 1 1 xp
7 1 1 xp
8 1 1 xp
9 1 1 xp
10 1 1 3 * x
11 1 1 xp
12 1 1 xp
13 1 1 xp
14 1 1 xp
15 1 1 3 * x
"""
def write_model(filename):
if not os.path.exists(filename):
with open(filename,"wt") as f:
f.write(file_4iupac)
# (1) Extracting seed coeeficients from iedera output
def seeds_coefficients(seed_pattern = "NNNnNnnNnNnnNNnNNN", alignment_length = 64):
"""
Extracting the iedera polynomial coefficients or counts from iedera (they are only given with the "-pF" option applied)
This function runs "iedera (>= v1.7)" on the given "seed pattern (iupac notation)" with an "alignment_length", and reads its output.
It returns a tuple of 2 elements, each element being a list of len = alignment_length + 1 :
- the first list gives the number of alignments with "i" matches (index i) detected by the iupac seed
- the second list gives the frequency of "alignments detected by the seed" / "alignments in total"
@param str : the seed pattern being used, as a iedera string on the 27 iupac ("ACGTRrYySsWwKkMmBbDdHhVvn@N") extended symbols for seeds, used by "iedera --iupac")
@param int : the alignment length
@return tuple(list<Fraction>,list<Fraction>) : the tuple with two lists
"""
write_model("file_4iupac.txt")
err = os.system('./src/iedera -iupac -l '+str(alignment_length)+' -s 1,64 -m "'+seed_pattern+'" -pF file_4iupac.txt -o iedera_output.txt')
if err != 0:
sys.exit("./src/iedera not running (please try to run \"./configure; make\" before)")
file = open('iedera_output.txt','r')
for line in file:
matches_pos = re.findall('([0-9]*) \* x \^ ([0-9]*)', line, re.DOTALL)
# transform tuples of strings into tuples of int
if matches_pos != []:
matches_pos_indiced = [0 for _ in range(alignment_length+1)]
matches_pos_indiced_frac = [0 for _ in range(alignment_length+1)]
# transform each tuple by its pos --> value, AND divide by 12**alignment_length, since all the probs per state are summing to 12 ;-)
for c1,c2 in matches_pos:
matches_pos_indiced [int(c2)] = Fraction(int(c1),12**alignment_length)
matches_pos_indiced_frac[int(c2)] = Fraction(int(c1),12**alignment_length * binomial(alignment_length,int(c2)))
file.close()
os.remove('iedera_output.txt')
return matches_pos_indiced,matches_pos_indiced_frac
# (2) Computation of probabilities on rational numbers
def poly_prob_frac(pos_coef,x):
"""
Compute the polynomial \sum_i pos_coef[i] * x ^ i * (1-x) ^ (n-i)
{where n = len(pos_coef) + 1}
as a rational number (Fraction)
@param list<Fraction> : the list of integers coefficients for the polynomial
@param Fraction : the value for x as a rational number (Fraction)
"""
value = Fraction(0)
n = len(pos_coef)
for i in range(len(pos_coef)):
value += pos_coef[i] * (x)**i * (1-x)**(n - i - 1)
return value
# print polynomial
def poly_str(pos_coef):
"""
Give the the polynomial \sum_i pos_coef[i] * x ^ i * (1-x) ^ (n-i) as a string
@param list<int> : the list of integers coefficients for the polynomial
@return str : the representation of this polynomial
"""
value = ""
n = len(pos_coef)
for i in range(len(pos_coef)):
value += str(pos_coef[i]) + " * x^" + str(i) + " * (1-x)^" + str(n - i - 1)
if i < len(pos_coef)-1:
value += " + "
return value
# (3) plotting scripts for one seed
delta_range = 1000
def plot_for_seed(plt = None, seed_pattern = "NNNnNnnNnNnnNNnNNN", color_plot = 'gray', alignment_length = 64):
"""
Plot (and compute before ...) on the current plt the frequency and probability for the seed to hit
"""
y_pos_coef,y_pos_frac = seeds_coefficients(seed_pattern,alignment_length)
# (1) Frequency version on rational number values
x_rational_i1 = [(i1/alignment_length) for i1 in range(alignment_length+1)]
plt.plot(x_rational_i1, y_pos_frac, marker='.',linestyle = 'None', color=color_plot, label=''+seed_pattern+' frequency')
# (2) Probabilities on rational numbers values
x_rational_i2 = [Fraction(i2,delta_range) for i2 in range(delta_range+1)]
y_rational_poly = [poly_prob_frac(y_pos_coef,x_rational) for x_rational in x_rational_i2]
plt.plot(x_rational_i2, y_rational_poly, color=color_plot, label=''+seed_pattern+' probability')
print(seed_pattern+" : "+poly_str(y_pos_coef))
# MAIN PROGRAMM
import matplotlib.pyplot as plt
alignment_length = 64
# Check arguments
if len(sys.argv) <= 1:
# Compute the plots for 3 test seeds
plot_for_seed(plt, seed_pattern = "NNNNNnnNNNnNNnNnnNnNNNnNNNN", color_plot = 'lightgray', alignment_length = alignment_length) # "NNNNNnnNNNnNNnNnnNnNNNnNNNN" is of weight 19
plot_for_seed(plt, seed_pattern = "RYNNNNNNNNNNNN", color_plot = 'darkblue', alignment_length = alignment_length) # weight 13 but with a ~ "1/4 x 1/4" "index x query" sparsity (so at least ~ "weight 15" considered)
plot_for_seed(plt, seed_pattern = "RYNNNNnnNNNnnNNNNN", color_plot = 'darkorange', alignment_length = alignment_length) # (same)
print("\033[93m This is an example : put your seeds as command line arguments !!! \033[0m")
print("\033[93m $"+sys.argv[0]+" \"RYNNNNNNNnnnNNNNN\" \"RYNNNNnnNNNnnNNNNN\" \"RRYNYNNNNNNNNNN\" \033[0m")
else:
# Compute the plots for several given seeds
# (https://matplotlib.org/stable/gallery/color/named_colors.html)
colors_plot = ['darkblue','darkorange','gold','orangered','grey','seagreen']
for i in range(1,len(sys.argv)):
plot_for_seed(plt, seed_pattern = sys.argv[i], color_plot = colors_plot[(i-1) % len(colors_plot)], alignment_length = alignment_length)
# Plotting all that stuff
plt.xlim([0,1])
plt.xlabel("alignment match frequency/probability")
plt.ylim([-0.01,+1.01])
plt.ylabel("1st hit frequency/probability")
plt.title("1st hit frequency/probability (alignment length = "+str(alignment_length)+")")
plt.legend(loc="upper left")
plt.show()