smilefinder.py

#!/usr/bin/python
import sys
import random
import numpy
import scipy.stats
from collections import deque
import time
import math
from operator import itemgetter, attrgetter

class BinarySearch:
    NOT_FOUND = False

    def binarySearch(self, list, value, left, right):
        if right < left:
            return self.NOT_FOUND

        mid = (left + right) / 2

        if right - left == 1:
        #if list[mid-1] < value <= list[mid]:
            return right
        #if list[mid] <= value < list[mid+1]:
         #   return mid

        if value > list[mid]:
            #return self.binarySearch(list, value, mid+1, right)
            return self.binarySearch(list, value, mid, right)

        elif value <= list[mid]:
            #return self.binarySearch(list, value, left, mid - 1)
            return self.binarySearch(list, value, left, mid)

        #else:
         #   return mid

    def search(self, list, value):
        left = 0
        right = len(list) - 1
        return self.binarySearch(list, value, left, right)

start = time.clock()

print 'What is the complete name of the input file ?'
input = raw_input('-->')
input = input[:-1]
inputFH = open(input, 'r')

print 'Choose the name of the report file?'
reportFH = raw_input('-->')
reportFH = reportFH[:-1]
report = open(reportFH, 'w')


outputFH = open('SmileFinderCompleteTable.csv', 'w')

outputFH.write('SNP\tChromosome\tPosition\t'+ 'Max window He1'+ '\t' + 'He1 Percentile'+ '\t '+ 'Maxwindow He2'+
               '\t'+ 'He2 Percentile' + '\t'+ 'Max window Fst' + '\t'+ 'Fst Percentile' + '\n')

print "How many windows ?"
numberOfWindows = int(raw_input('-->'))

print "What size for the shortest window ?"
startSize = currentSize = int(raw_input('-->'))

windowIncrement = 2
maxWindowSize = startSize + windowIncrement * (numberOfWindows - 1)

print "How much resampling ?"
resampling = int(raw_input('-->'))

print "Choose the sensitiviy"
sensitivity = raw_input('-->')
sensitivity = float(sensitivity[:-1])

print "Info:"
print "Number of windows: " + str(numberOfWindows)
print "Start size (min win size): " + str(startSize)
print "Window increment: " + str(windowIncrement)
print "Maximum window size: " + str(maxWindowSize)
print "Resampling value: " + str(resampling)


#Creating windows
windows = []
while len(windows) < numberOfWindows:
    windows.append(currentSize)
    currentSize += windowIncrement

# skip the header of input file and read it to memory

inputFH.readline()


#Getting the SNPs and values
allSNPs = []
for line in inputFH:
    snpData = line.split('\t')#character used to split might need to be changed according to the format of the input file
    #print snpData
    allSNPs.append([str(snpData[0])+','+str(snpData[1])+','+str(snpData[2]), snpData[7], snpData[12], snpData[13]]) #6,13,17 for Hexpected, 7,14,17 for Hobserved, 7,12,13 for Selsim
SIZE = float(len(allSNPs))


#Start Sampling
currentWindow = {}
sampled = {}
counter = 0

for item in allSNPs:


    for window in windows:
        avgSnp = [0,0,0]


        if window in currentWindow:

            currentWindow[window][0].append(item[0])
            currentWindow[window][1].append(float(item[1]))
            currentWindow[window][2].append(float(item[2]))
            currentWindow[window][3].append(float(item[3]))
            #print currentWindow[window][0]

        else:
            currentWindow[window] = [deque([item[0]]), deque([float(item[1])]),
                                     deque([float(item[2])]),
                                     deque([float(item[3])])]

        if len(currentWindow[window][0]) == window:
            avgSnp[0] = sum(currentWindow[window][1]) / window
            avgSnp[1] = sum(currentWindow[window][2]) / window
            mean_avgSnp_2 = sum(currentWindow[window][3]) /window
            sq_diff = 0
            for elem in currentWindow[window][3]:
                sq_diff += (elem - mean_avgSnp_2)**2 / (window -1)
            avgSnp[2] = sq_diff

            TargetIndex = window /2
            Target = currentWindow[window][0][TargetIndex]
            #print Target + '\t' + str(avgSnp[0])

            rm = currentWindow[window][0].popleft()
            rm = currentWindow[window][1].popleft()
            rm = currentWindow[window][2].popleft()
            rm = currentWindow[window][3].popleft()
            #print sampled


            if Target in sampled:
                sampled[Target].append([window, avgSnp[0], avgSnp[1], avgSnp[2]])
            else:
                sampled[Target] = [[window, avgSnp[0], avgSnp[1], avgSnp[2]]]

    counter += 1
    if counter % 1000 == 0:
        percentDone = (counter/ SIZE) * 100
        print "Sampling Progress: "+ str(percentDone) + "%"

print 'Done in ' + str(time.clock() - start)

#sampled_out = open('sampled_out', 'w')
#for Target in sampled:
 #   sampled_out.write(str(Target)+ '\t'+ str(sampled[Target])+'\n')

#print sampled

del currentWindow

#Start Resampling
currentWindow = {}
max = len(allSNPs) - 1
counter = 0

resampled = {}
for i in xrange(0, resampling):

    for window in windows:
        rdm_avgSnp = [0,0,0]
        currentWindow[window] = [[], [], []]
        while len(currentWindow[window][0]) < window:
            rdm = numpy.random.randint(0, max)
            rdmSnp = allSNPs[rdm]
            currentWindow[window][0].append(float(rdmSnp[1]))
            currentWindow[window][1].append(float(rdmSnp[2]))
            currentWindow[window][2].append(float(rdmSnp[3]))

        rdm_avgSnp[0] = sum(currentWindow[window][0]) / window
        rdm_avgSnp[1] = sum(currentWindow[window][1]) / window
        #rdm_avgSnp[2] = sum(currentWindow[window][2]) / window
        mean_rdm_avgSnp_2 = sum(currentWindow[window][2]) /window
        sq_diff = 0
        for elem in currentWindow[window][2]:
            sq_diff += (elem - mean_avgSnp_2)**2 / (window -1)
        rdm_avgSnp[2] = sq_diff

        if window in resampled:
            resampled[window][0].append(rdm_avgSnp[0])
            resampled[window][1].append(rdm_avgSnp[1])
            resampled[window][2].append(rdm_avgSnp[2])
        else:
            resampled[window] = [[rdm_avgSnp[0]], [rdm_avgSnp[1]], [rdm_avgSnp[2]]]

    counter += 1
    if counter % 1000 == 0:
        percentDone = (counter / float(resampling)) *100
        print "Resampling Progress: "+ str(percentDone) + "%"

print 'Done in ' + str(time.clock() - start)


print 'Sorting and tie ranking the resampled distributions'
tied_ranks = {}
tied_ranks2 = {}
tied_ranks3 = {}
for window in resampled:

    resampled[window][0].sort()
    value = 0
    tie = 0
    for n in resampled[window][0]: #resampled he1
        if n > value:
            value = n
            tie += 1
        if window in tied_ranks:
            tied_ranks[window].append(tie)
        else:
            tied_ranks[window] = [tie]

    resampled[window][1].sort()
    value = 0
    tie = 0
    for n in resampled[window][1]: #resampled he2
        if n > value:
            value = n
            tie += 1
        if window in tied_ranks2:
            tied_ranks2[window].append(tie)
        else:
            tied_ranks2[window] = [tie]

    resampled[window][2].sort()
    value = 0
    tie = 0
    for n in resampled[window][2]: #resampled Fst
        if n > value:
            value = n
            tie += 1
        if window in tied_ranks3:
            tied_ranks3[window].append(tie)
        else:
            tied_ranks3[window] = [tie]

print 'Done in ' + str(time.clock() - start)


allSNPsOut = []
counter = 0
for snp in sampled:
    snpPerc = [1.1,1.1,0]
    SizeHe1 = 0
    SizeHe2 = 0
    SizeFst = 0

    for window in sampled[snp]:
        PercentileIndex = BinarySearch()
        if PercentileIndex.search(resampled[window[0]][0], window[1]):
            PercHe1 = tied_ranks[window[0]][PercentileIndex.search(resampled[window[0]][0], window[1])] / float(tied_ranks[window[0]][-1])
            #print tied_ranks[window[0]][PercentileIndex.search(resampled[window[0]][0], window[1])]
            #print float(len(tied_ranks[window[0]]))
        else:
            PercHe1 = 1
        if PercentileIndex.search(resampled[window[0]][1], window[2]):
            PercHe2 = tied_ranks2[window[0]][PercentileIndex.search(resampled[window[0]][1], window[2])] / float(tied_ranks2[window[0]][-1])
        else:
            PercHe2 = 1
        if PercentileIndex.search(resampled[window[0]][2], window[3]):
            PercFst = tied_ranks3[window[0]][PercentileIndex.search(resampled[window[0]][2], window[3])] / float(tied_ranks3[window[0]][-1])
            #print tied_ranks3[window[0]][PercentileIndex.search(resampled[window[0]][2], window[3])]
            #print float(tied_ranks3[window[0]][-1])
        else:
            PercFst = 1
        if PercHe1 < snpPerc[0]:
            snpPerc[0] = PercHe1
            SizeHe1 = window[0]
        if PercHe2 < snpPerc[1]:
            snpPerc[1] = PercHe2
            SizeHe2 = window[0]
        if PercFst > snpPerc[2]:
            snpPerc[2] = PercFst
            SizeFst = window[0]

    snpArray = snp.split(',')
    allSNPsOut.append([ snpArray[0] , snpArray[1] , int(snpArray[2]) ,[ '\t' + str(SizeHe1) + '\t'+  str(snpPerc[0]) + '\t'+ str(SizeHe2) + '\t' + str(snpPerc[1]) + '\t' + str(SizeFst) + '\t' + str(snpPerc[2]) + '\n']])
    counter += 1
    if counter % 100 == 0:
        percentDone = (counter / SIZE) *100
        print "Percentile Calculation Progress: "+ str(percentDone) + "%"

allSNPsOut.sort(key=itemgetter(1,2))
for item in allSNPsOut:
    outputFH.write(str(item[0])+'\t'+str(item[1])+'\t'+str(item[2])+item[3][0])


print 'Done in ' + str(time.clock() - start)

#Now let's build the report of identified sweeps

print "Building the Report"

outputFH.flush()
outputFH.close()

inputFH = open('SmileFinderCompleteTable.csv', 'r')
report.write('Type of Selection\t' + 'SNP'+'\t'+ 'Max window He1'+ '\t' + 'He1 Percentile'+ '\t '+ 'Maxwindow He2'+
               '\t'+ 'He2 Percentile' + '\t'+ 'Max window Fst' + '\t'+ 'Fst Percentile' + '\n')

inputFH.readline()
for line in inputFH:
    array = line.split('\t')
    if float(array[2]) < sensitivity and float(array[4]) < sensitivity and float(array[6]) < sensitivity:
        report.write('Ancestral\t' + line)
    if float(array[2]) < sensitivity and float(array[4]) < sensitivity and float(array[6]) > (1-sensitivity):
        report.write('Recent in both Populations\t' + line)
    if float(array[2]) > sensitivity and float(array[4]) < sensitivity and float(array[6]) > (1-sensitivity):
        report.write('Recent in Population 2\t' + line)
    if float(array[2]) < sensitivity and float(array[4]) > sensitivity and float(array[6]) > (1-sensitivity):
        report.write('Recent in Population 1\t' + line)