-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgibbs_sampling.py
154 lines (142 loc) · 5.88 KB
/
gibbs_sampling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import random
from time import sleep
from tqdm import tqdm, trange
from termcolor import colored
def symbolToNumber(symbol): # symbol is a string of length 1
if symbol == "A":
return 0
if symbol == "C":
return 1
if symbol == "G":
return 2
if symbol == "T":
return 3
def numberToSymbol(x): # x is an integer
if x == 0:
return "A"
if x == 1:
return "C"
if x == 2:
return "G"
if x == 3:
return "T"
def profileRandom(k, profile, text): # k is an integer, profile is a list of lists of floats, text is a string
probs = []
for i in range(0,len(text) - k +1): # for each kmer in text
prob = 1.0
pattern = text[i:i+k] # pattern is the kmer
for j in range(k): # for each symbol in the kmer
l = symbolToNumber(pattern[j]) # l is the index of the symbol in the profile
prob *= profile[l][j] # prob is the product of the probabilities of the symbols in the kmer
probs.append(prob) # add the probability of the kmer to the list of probabilities
r = myRandom(probs) # r is the index of the kmer with the highest probability
return r
def profileForm(motifs): # motifs is a list of strings
k = len(motifs[0]) # k is the length of the first string in motifs
profile = [[1 for i in range(k)] for j in range(4)] # initialize profile to a list of lists of length k with all elements initialized to 1
for x in motifs: # for each string in motifs
for i in range(len(x)): # for each symbol in the string
j = symbolToNumber(x[i]) # j is the index of the symbol in the profile
profile[j][i] += 1 # add 1 to the count of the symbol in the profile
for x in profile: # for each list in profile
for i in range(len(x)): # for each element in the list
x[i] = x[i]/len(motifs) # divide the count of the symbol by the number of strings in motifs
return profile
def consensus(profile): # profile is a list of lists of floats
str = ""
for i in range(len(profile[0])): # for each element in the first list in profile
max = 0
loc = 0
for j in range(4): # for each element in the profile
if profile[j][i] > max: # if the element in the profile is greater than the current max
loc = j # set the location to the index of the element in the profile
max = profile[j][i] # set the max to the element in the profile
str+=numberToSymbol(loc) # add the symbol corresponding to the location to the string
return str
def score(motifs): # motifs is a list of strings
profile = profileForm(motifs) # profile is a list of lists of floats
cons = consensus(profile) # cons is a string
score = 0
for x in motifs: # for each string in motifs
for i in range(len(x)): # for each symbol in the string
if cons[i] != x[i]: # if the symbol in the consensus is different from the symbol in the string
score += 1 # add 1 to the score
return score
def myRandom(dist): # dist is a list of floats
s = 0.0
for x in dist: # for each element in dist
s+= x # add the element to s
i = random.random() # i is a random number between 0 and s
partial = 0.0
for x in range(len(dist)): # for each element in dist
partial += dist[x] # add the element to partial
if partial/s >= i: # if the partial sum divided by s is greater than or equal to i
return x
def gibbsSampler(dna, k, t, n): # dna is a list of strings, k,t and n are integers
bestMotifs = []
motifs = []
for x in range(len(dna)): # for each string in dna
i = random.randint(0, len(dna[x])-k) # i is a random integer between 0 and the length of the string minus k
motifs.append(dna[x][i:i+k]) # add the k-mer starting at index i to the motifs list
bestMotifs = motifs[:] # bestMotifs is a copy of motifs
for i in range(n): # for each iteration
j = random.randint(0,t-1) # j is a random integer between 0 and t-1
profile = profileForm(motifs[:j] + motifs[j+1:]) # profile is a list of lists of floats
r = profileRandom(k, profile, dna[j]) # r is a random k-mer from the profile
motifs[j] = dna[j][r:r+k] # set the jth element of motifs to the k-mer starting at index r
if score(motifs) < score(bestMotifs): # if the score of motifs is less than the score of bestMotifs
bestMotifs = motifs[:] # set bestMotifs to a copy of motifs
return bestMotifs
output = open('./output/Key1.txt','w')
i = 0
before_len = len("motif width is ")
k = 0
base = ['A', 'T', 'C', 'G']
dna = []
with open('./testing_data/Q1.txt') as f:
for line in f:
# print(line)
if not line.isspace():
temp = line.strip()
if i == 0 :
k = int(temp[before_len:])
elif temp[0] in base:
dna.append(temp)
i += 1
# output.write(f"dna = {dna}")
n = 1000 # iteration
t = len(dna)
best = gibbsSampler(dna, k, t, n)
s = score(best)
for x in tqdm(range(200)): # for each iteration of the gibbs sampler
sample = gibbsSampler(dna, k, t, n)
if score(sample) < s: # if the score of the sample is less than the score of the best
s = score(sample) # set s to the score of the sample
best = sample[:] # set best to a copy of the sample
# output = open('Q1.txt','w')
# Find the key
Key = str()
for i in range(k):
count = [0] * 4 # ACGT
for j in range(len(best)):
b = best[j][i]
count[symbolToNumber(b)] += 1
max_symbol = numberToSymbol(count.index(max(count)))
Key += max_symbol
output.write(f"Key = {Key}\n")
# Find the index of every motif
index = []
for i in range(len(dna)):
b = best[i]
index.append(dna[i].find(b))
# Print motif result
for i in range(len(dna)):
start_index = index[i]
output.write(f">Sequence{i + 1}(start at position {start_index + 1}, motif is {best[i]})")
output.write("\n")
output.write(dna[i][:start_index]+"'"+dna[i][start_index:start_index+k]+"'"+dna[i][start_index+k:])
output.write("\n")
# Print the best motif
for b in best:
print(b)
output.close()