-
Notifications
You must be signed in to change notification settings - Fork 0
/
CovDetect.py
143 lines (113 loc) · 4.97 KB
/
CovDetect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#! /usr/bin/python
# Coverage detect version 1.0
# This software was written by Ivan R. Wolf
# Python modules required: Numpy, Pandas
#
import numpy as np
import pandas as pd
import collections
import argparse
def create_blocks(df, scfld, basepairs):
# Function to create blocks of genomic regions detected
blocks = []
x = 0
s = []
#basepairs = 100
if isinstance(df.loc[scfld]['base'], np.float64):
return False
else:
for i in df.loc[scfld]['base'].tolist():
if (i - x + 1) <= basepairs: # Determine range of blocks
s.append(i)
x = i
elif len(s) == 0:
s.append(i)
x = i
else:
if min(s) != max(s):
blocks.append((scfld, min(s),max(s)))
s = []
s.append(i)
x = i
else:
s = []
s.append(i)
x = i
return blocks
def save_log(gsize, sigmaCi_0B, CovMed_0B, cutoff_0B, sigmaCi_1B, CovMed_1B, cutoff_1B, selected_scfld, AvgCovNorm1B, AvgCovNorm0B, meanRatio_0B, stdv_0B, output_log):
# Save log
f=open(output_log,"w")
print >>f, "Genome size:", gsize
print >>f, "0B Coverage sum:", sigmaCi_0B
print >>f, "0B Avg Coverage:", CovMed_0B
print >>f, "0B Cutoff Value:", cutoff_0B
print >>f, "1B Coverage sum:", sigmaCi_1B
print >>f, "1B Avg Coverage:", CovMed_1B
print >>f, "1B Cutoff Value:", cutoff_1B
print >>f, "Scafold selected for normalization:", selected_scfld
print >>f, "Average of normalized coverage for 1B:", AvgCovNorm1B
print >>f, "Average of normalized coverage for 0B:", AvgCovNorm0B
print >>f, "Mean Ratio:", meanRatio_0B
print >>f, "Mean Ratio STDV:", stdv_0B
f.close()
return
def main(args):
output_table = "Seleted_bases_" + str(args.input) + "_STDV" + str(args.standev) + "_BP" + str(args.basepairs) + ".txt"
output_blocks = "Seleted_bases_" + str(args.input) + "_STDV" + str(args.standev) + "_BP" + str(args.basepairs) + ".blocks.txt"
output_log = "Seleted_bases_" + str(args.input) + "_STDV" + str(args.standev) + "_BP" + str(args.basepairs) + ".log.txt"
df = pd.read_csv(args.input,index_col=0,header=None,engine='c',sep='\t') # Load Data
df.columns = ["base","cov_0B","cov_1B"] # Rename Columns
gsize = df.shape[0]
sigmaCi_0B = df['cov_0B'].sum()
CovMed_0B = sigmaCi_0B/gsize
cutoff_0B = CovMed_0B / 2
sigmaCi_1B = df['cov_1B'].sum()
CovMed_1B = sigmaCi_1B/gsize
cutoff_1B = CovMed_1B / 2
df['size'] = pd.DataFrame.from_dict(collections.Counter(df.index.values),orient='index') # Add a size colum
df['norm_0B'] = (df['cov_0B'] / df['size']) / CovMed_0B # Add a column with normalized value for 0B
df['norm_1B'] = (df['cov_1B'] / df['size']) / CovMed_1B # Add a column with normalized value for 1B
# Get scafold info
scfld_info_0B = pd.concat([df.groupby(level=0).var()['cov_0B'],
pd.DataFrame.from_dict(collections.Counter(df.index.values),orient='index')], axis=1)
scfld_info_0B.columns = ["cov_var","size"]
scfld_0B_list = scfld_info_0B.sort_values(['cov_var', 'size'], ascending=[True, False]).index.values.tolist()
scfld_info_1B = pd.concat([df.groupby(level=0).var()['cov_1B'],
pd.DataFrame.from_dict(collections.Counter(df.index.values),orient='index')], axis=1) # Join information
scfld_info_1B.columns = ["cov_var","size"]
scfld_1B_list = scfld_info_1B.sort_values(['cov_var', 'size'], ascending=[True, False]).index.values.tolist()
# Select scafold and calculate mean ratio
selected_scfld = [i for i, j in zip(scfld_0B_list, scfld_1B_list) if i == j][0]
AvgCovNorm1B = df.loc[selected_scfld]['norm_1B'].mean()
AvgCovNorm0B = df.loc[selected_scfld]['norm_0B'].mean()
meanRatio_0B = AvgCovNorm1B / AvgCovNorm0B
stdv_0B = df.loc[selected_scfld]['norm_0B'].std()
# Filter table based on cutoffs
df = df[df['cov_1B'] >= cutoff_1B]
df = df[df['cov_0B'] >= cutoff_0B]
# Add the ratio for 1B
df['ratio_1B'] = df['norm_1B'] / df['norm_0B']
# Save bases with additional info
#(df[df['ratio_1B'] >= (meanRatio_0B + stdv_0B)]).to_csv("Selected_bases.txt",sep='\t')
df = df[df['ratio_1B'] >= (meanRatio_0B + (stdv_0B * args.standev))] # Determine number of stdv
save_log(gsize, sigmaCi_0B, CovMed_0B, cutoff_0B, sigmaCi_1B, CovMed_1B, cutoff_1B, selected_scfld, AvgCovNorm1B, AvgCovNorm0B, meanRatio_0B, stdv_0B, output_log)
df.to_csv(output_table,sep='\t')
block_df = pd.DataFrame()
for scfld in set(df.index.values):
block_list = create_blocks(df,scfld, args.basepairs)
if block_list == False:
continue
else:
block_df = pd.concat([block_df, pd.DataFrame(block_list)])
block_df.columns = ["Scafold","start","end"]
block_df.to_csv(output_blocks,sep='\t',index=False)
def init():
parser = argparse.ArgumentParser(description="Retrieve regions with coverage above threshould between two different experiments")
parser.add_argument("input", help="Input File")
parser.add_argument("-bp", "--basepairs", type=int, default=100, help="Determine base pairs for sliding bases")
parser.add_argument("-stdv", "--standev", type=int, default=2, help="Number of standard deviations used to select the bases")
args = parser.parse_args()
return args
if __name__ == "__main__":
args = init()
main(args)