-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsmilefinder.py
330 lines (257 loc) · 10.5 KB
/
smilefinder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
#!/usr/bin/python
import sys
import random
import numpy
import scipy.stats
from collections import deque
import time
import math
from operator import itemgetter, attrgetter
class BinarySearch:
NOT_FOUND = False
def binarySearch(self, list, value, left, right):
if right < left:
return self.NOT_FOUND
mid = (left + right) / 2
if right - left == 1:
#if list[mid-1] < value <= list[mid]:
return right
#if list[mid] <= value < list[mid+1]:
# return mid
if value > list[mid]:
#return self.binarySearch(list, value, mid+1, right)
return self.binarySearch(list, value, mid, right)
elif value <= list[mid]:
#return self.binarySearch(list, value, left, mid - 1)
return self.binarySearch(list, value, left, mid)
#else:
# return mid
def search(self, list, value):
left = 0
right = len(list) - 1
return self.binarySearch(list, value, left, right)
start = time.clock()
print 'What is the complete name of the input file ?'
input = raw_input('-->')
input = input[:-1]
inputFH = open(input, 'r')
print 'Choose the name of the report file?'
reportFH = raw_input('-->')
reportFH = reportFH[:-1]
report = open(reportFH, 'w')
outputFH = open('SmileFinderCompleteTable.csv', 'w')
outputFH.write('SNP\tChromosome\tPosition\t'+ 'Max window He1'+ '\t' + 'He1 Percentile'+ '\t '+ 'Maxwindow He2'+
'\t'+ 'He2 Percentile' + '\t'+ 'Max window Fst' + '\t'+ 'Fst Percentile' + '\n')
print "How many windows ?"
numberOfWindows = int(raw_input('-->'))
print "What size for the shortest window ?"
startSize = currentSize = int(raw_input('-->'))
windowIncrement = 2
maxWindowSize = startSize + windowIncrement * (numberOfWindows - 1)
print "How much resampling ?"
resampling = int(raw_input('-->'))
print "Choose the sensitiviy"
sensitivity = raw_input('-->')
sensitivity = float(sensitivity[:-1])
print "Info:"
print "Number of windows: " + str(numberOfWindows)
print "Start size (min win size): " + str(startSize)
print "Window increment: " + str(windowIncrement)
print "Maximum window size: " + str(maxWindowSize)
print "Resampling value: " + str(resampling)
#Creating windows
windows = []
while len(windows) < numberOfWindows:
windows.append(currentSize)
currentSize += windowIncrement
# skip the header of input file and read it to memory
inputFH.readline()
#Getting the SNPs and values
allSNPs = []
for line in inputFH:
snpData = line.split('\t')#character used to split might need to be changed according to the format of the input file
#print snpData
allSNPs.append([str(snpData[0])+','+str(snpData[1])+','+str(snpData[2]), snpData[7], snpData[12], snpData[13]]) #6,13,17 for Hexpected, 7,14,17 for Hobserved, 7,12,13 for Selsim
SIZE = float(len(allSNPs))
#Start Sampling
currentWindow = {}
sampled = {}
counter = 0
for item in allSNPs:
for window in windows:
avgSnp = [0,0,0]
if window in currentWindow:
currentWindow[window][0].append(item[0])
currentWindow[window][1].append(float(item[1]))
currentWindow[window][2].append(float(item[2]))
currentWindow[window][3].append(float(item[3]))
#print currentWindow[window][0]
else:
currentWindow[window] = [deque([item[0]]), deque([float(item[1])]),
deque([float(item[2])]),
deque([float(item[3])])]
if len(currentWindow[window][0]) == window:
avgSnp[0] = sum(currentWindow[window][1]) / window
avgSnp[1] = sum(currentWindow[window][2]) / window
mean_avgSnp_2 = sum(currentWindow[window][3]) /window
sq_diff = 0
for elem in currentWindow[window][3]:
sq_diff += (elem - mean_avgSnp_2)**2 / (window -1)
avgSnp[2] = sq_diff
TargetIndex = window /2
Target = currentWindow[window][0][TargetIndex]
#print Target + '\t' + str(avgSnp[0])
rm = currentWindow[window][0].popleft()
rm = currentWindow[window][1].popleft()
rm = currentWindow[window][2].popleft()
rm = currentWindow[window][3].popleft()
#print sampled
if Target in sampled:
sampled[Target].append([window, avgSnp[0], avgSnp[1], avgSnp[2]])
else:
sampled[Target] = [[window, avgSnp[0], avgSnp[1], avgSnp[2]]]
counter += 1
if counter % 1000 == 0:
percentDone = (counter/ SIZE) * 100
print "Sampling Progress: "+ str(percentDone) + "%"
print 'Done in ' + str(time.clock() - start)
#sampled_out = open('sampled_out', 'w')
#for Target in sampled:
# sampled_out.write(str(Target)+ '\t'+ str(sampled[Target])+'\n')
#print sampled
del currentWindow
#Start Resampling
currentWindow = {}
max = len(allSNPs) - 1
counter = 0
resampled = {}
for i in xrange(0, resampling):
for window in windows:
rdm_avgSnp = [0,0,0]
currentWindow[window] = [[], [], []]
while len(currentWindow[window][0]) < window:
rdm = numpy.random.randint(0, max)
rdmSnp = allSNPs[rdm]
currentWindow[window][0].append(float(rdmSnp[1]))
currentWindow[window][1].append(float(rdmSnp[2]))
currentWindow[window][2].append(float(rdmSnp[3]))
rdm_avgSnp[0] = sum(currentWindow[window][0]) / window
rdm_avgSnp[1] = sum(currentWindow[window][1]) / window
#rdm_avgSnp[2] = sum(currentWindow[window][2]) / window
mean_rdm_avgSnp_2 = sum(currentWindow[window][2]) /window
sq_diff = 0
for elem in currentWindow[window][2]:
sq_diff += (elem - mean_avgSnp_2)**2 / (window -1)
rdm_avgSnp[2] = sq_diff
if window in resampled:
resampled[window][0].append(rdm_avgSnp[0])
resampled[window][1].append(rdm_avgSnp[1])
resampled[window][2].append(rdm_avgSnp[2])
else:
resampled[window] = [[rdm_avgSnp[0]], [rdm_avgSnp[1]], [rdm_avgSnp[2]]]
counter += 1
if counter % 1000 == 0:
percentDone = (counter / float(resampling)) *100
print "Resampling Progress: "+ str(percentDone) + "%"
print 'Done in ' + str(time.clock() - start)
print 'Sorting and tie ranking the resampled distributions'
tied_ranks = {}
tied_ranks2 = {}
tied_ranks3 = {}
for window in resampled:
resampled[window][0].sort()
value = 0
tie = 0
for n in resampled[window][0]: #resampled he1
if n > value:
value = n
tie += 1
if window in tied_ranks:
tied_ranks[window].append(tie)
else:
tied_ranks[window] = [tie]
resampled[window][1].sort()
value = 0
tie = 0
for n in resampled[window][1]: #resampled he2
if n > value:
value = n
tie += 1
if window in tied_ranks2:
tied_ranks2[window].append(tie)
else:
tied_ranks2[window] = [tie]
resampled[window][2].sort()
value = 0
tie = 0
for n in resampled[window][2]: #resampled Fst
if n > value:
value = n
tie += 1
if window in tied_ranks3:
tied_ranks3[window].append(tie)
else:
tied_ranks3[window] = [tie]
print 'Done in ' + str(time.clock() - start)
allSNPsOut = []
counter = 0
for snp in sampled:
snpPerc = [1.1,1.1,0]
SizeHe1 = 0
SizeHe2 = 0
SizeFst = 0
for window in sampled[snp]:
PercentileIndex = BinarySearch()
if PercentileIndex.search(resampled[window[0]][0], window[1]):
PercHe1 = tied_ranks[window[0]][PercentileIndex.search(resampled[window[0]][0], window[1])] / float(tied_ranks[window[0]][-1])
#print tied_ranks[window[0]][PercentileIndex.search(resampled[window[0]][0], window[1])]
#print float(len(tied_ranks[window[0]]))
else:
PercHe1 = 1
if PercentileIndex.search(resampled[window[0]][1], window[2]):
PercHe2 = tied_ranks2[window[0]][PercentileIndex.search(resampled[window[0]][1], window[2])] / float(tied_ranks2[window[0]][-1])
else:
PercHe2 = 1
if PercentileIndex.search(resampled[window[0]][2], window[3]):
PercFst = tied_ranks3[window[0]][PercentileIndex.search(resampled[window[0]][2], window[3])] / float(tied_ranks3[window[0]][-1])
#print tied_ranks3[window[0]][PercentileIndex.search(resampled[window[0]][2], window[3])]
#print float(tied_ranks3[window[0]][-1])
else:
PercFst = 1
if PercHe1 < snpPerc[0]:
snpPerc[0] = PercHe1
SizeHe1 = window[0]
if PercHe2 < snpPerc[1]:
snpPerc[1] = PercHe2
SizeHe2 = window[0]
if PercFst > snpPerc[2]:
snpPerc[2] = PercFst
SizeFst = window[0]
snpArray = snp.split(',')
allSNPsOut.append([ snpArray[0] , snpArray[1] , int(snpArray[2]) ,[ '\t' + str(SizeHe1) + '\t'+ str(snpPerc[0]) + '\t'+ str(SizeHe2) + '\t' + str(snpPerc[1]) + '\t' + str(SizeFst) + '\t' + str(snpPerc[2]) + '\n']])
counter += 1
if counter % 100 == 0:
percentDone = (counter / SIZE) *100
print "Percentile Calculation Progress: "+ str(percentDone) + "%"
allSNPsOut.sort(key=itemgetter(1,2))
for item in allSNPsOut:
outputFH.write(str(item[0])+'\t'+str(item[1])+'\t'+str(item[2])+item[3][0])
print 'Done in ' + str(time.clock() - start)
#Now let's build the report of identified sweeps
print "Building the Report"
outputFH.flush()
outputFH.close()
inputFH = open('SmileFinderCompleteTable.csv', 'r')
report.write('Type of Selection\t' + 'SNP'+'\t'+ 'Max window He1'+ '\t' + 'He1 Percentile'+ '\t '+ 'Maxwindow He2'+
'\t'+ 'He2 Percentile' + '\t'+ 'Max window Fst' + '\t'+ 'Fst Percentile' + '\n')
inputFH.readline()
for line in inputFH:
array = line.split('\t')
if float(array[2]) < sensitivity and float(array[4]) < sensitivity and float(array[6]) < sensitivity:
report.write('Ancestral\t' + line)
if float(array[2]) < sensitivity and float(array[4]) < sensitivity and float(array[6]) > (1-sensitivity):
report.write('Recent in both Populations\t' + line)
if float(array[2]) > sensitivity and float(array[4]) < sensitivity and float(array[6]) > (1-sensitivity):
report.write('Recent in Population 2\t' + line)
if float(array[2]) < sensitivity and float(array[4]) > sensitivity and float(array[6]) > (1-sensitivity):
report.write('Recent in Population 1\t' + line)