-
Notifications
You must be signed in to change notification settings - Fork 0
/
parentage_check.py
201 lines (137 loc) · 5.57 KB
/
parentage_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#########################
# parentage_check.py
#########################
"""
Which of the candidate parents actually show highest allele sharing with a given progeny?
inputs:
- genotypes for candidate parents and progeny in VCF format
- list of candidate parent IDs
- list of progeny IDs
output2:
- a matrix of allele sharing between progenies and each candidate parent, specifically:
the proportion of sites where a progeny shares one or more alleles with the candidate parent
- the same matrix but instead with RANKS: ascending order from highest proportion to lowest.
a match is defined as sharing at least one allele with the candidate parent at a given site.
a mismatch is defined as not sharing any alleles with the candidate parent at a given site.
Example usage:
python parentage_check.py --vcf test.vcf --possible_parents parents.txt --progeny progenies.txt
"""
import argparse
import os
########################## HEAD
# this function checks if file exists and exits script if not
def extant_file(x):
"""
'Type' for argparse - checks that file exists but does not open.
"""
if not os.path.exists(x):
print ("Error: {0} does not exist".format(x))
exit()
x = str(x)
return x
# breaks script if non-UNIX linebreaks in input file popmap
def linebreak_check(x):
if "\r" in open(x, "rb").readline():
print ("Error: classic mac (CR) or DOS (CRLF) linebreaks in {0}".format(x) )
exit()
######
# parses arguments from command line
def get_commandline_arguments ():
parser = argparse.ArgumentParser()
parser.add_argument("--vcf", required=True, type=extant_file,
help="name/path of vcf input file", metavar="FILE")
parser.add_argument("--possible_parents", required=True, type=extant_file,
help="name/path of possible parents input file: textfile with sample names; one line per sample", metavar="FILE")
parser.add_argument("--progeny", required=True, type=extant_file,
help="name/path of (F1) progeny input file: textfile with sample names; one line per sample", metavar="FILE")
args = parser.parse_args()
# finish
return args
#### CORE FUNCTIONS
def read_sample_list (infile):
sample_list = []
with open(infile, "r") as INFILE:
for line in INFILE:
if len(line) > 1:
fields = line.strip("\n").split()
sample_list.append(fields[0])
return list(set(sample_list))
def count_allele_sharing_and_mismatch (header_line, vcf_data, progeny, candidate_parent):
prog_i = header_line.index(progeny)
par_i = header_line.index(candidate_parent)
# print(progeny, prog_i)
# print(candidate_parent, par_i)
matches = 0
mismatches = 0
# now start the main business of walking through the vcf:
out_columns = []
linecnt = 0
snpcnt = 0
for fields in vcf_data:
alleles_prog = set( fields[prog_i].split(":")[0].split("/") )
alleles_par = set( fields[par_i].split(":")[0].split("/") )
alleles_both = alleles_par.union(alleles_prog)
# drop missing then count match or mismatch
# print (alleles_prog, alleles_par, alleles_both)
if not "." in alleles_both:
if len( alleles_prog.intersection(alleles_par) ) > 0:
matches += 1
else: # progeny does not have at least one of the parents alleles...
mismatches += 1
return (round(matches /float(matches + mismatches), 10))
def read_vcf_to_memory(infile):
with open(infile, "r") as INFILE:
for line in INFILE:
if line.startswith("##"):
continue
if line.startswith("#"):
header_line = line.lstrip("#").strip("\n").split("\t")
break
# now start the main business of walking through the vcf:
vcf_data = []
with open(infile, "r") as INFILE:
linecnt = 0
snpcnt = 0
for line in INFILE:
linecnt += 1
if line.startswith("#"):
continue
if len(line) < 2: # empty lines or so
continue
fields = line.strip("\n").split("\t")
vcf_data.append(fields)
print("parsed VCF into memory")
return header_line, vcf_data
####### MAIN
args = get_commandline_arguments ()
#print args
potential_parents = read_sample_list(args.possible_parents)
progeny = read_sample_list(args.progeny)
print ("parents = ", potential_parents)
print ("progeny = ", progeny)
header_line, vcf_data = read_vcf_to_memory(args.vcf)
outlines1 = []
outlines1.append( "proportion of sites where a progeny shares one or more alleles with the candidate parent" )
outlines1.append( "\t".join([ "progeny_ID" ] + potential_parents) )
outlines2 = []
outlines2.append( "potential parents ranked by matching proportion, 1 = highest matching proportion" )
outlines2.append( "\t".join([ "progeny_ID" ] + potential_parents) )
for prog in progeny:
candidate_parent_matches = []
for par in potential_parents:
proportion_matches = count_allele_sharing_and_mismatch (header_line, vcf_data, prog, par)
candidate_parent_matches.append( proportion_matches )
print(prog, candidate_parent_matches)
outlines1.append( "\t".join( [prog] + [str(x) for x in candidate_parent_matches ] ) )
# now the ranks: https://stackoverflow.com/questions/3071415/efficient-method-to-calculate-the-rank-vector-of-a-list-in-python
ranked = [sorted(candidate_parent_matches).index(x) for x in candidate_parent_matches]
ranked1 = [abs(x-max(ranked))+1 for x in ranked] # make the rankes ascending
outlines2.append( "\t".join( [prog] + [str(x) for x in ranked1 ] ) )
print ("\n[finishing] writing outputs to:")
with open(args.vcf + ".parentage_check.proportion_matches.txt", "w") as O:
O.write("\n".join(outlines1) + "\n")
with open(args.vcf + ".parentage_check.ranks.txt", "w") as O:
O.write("\n".join(outlines2) + "\n")
print (args.vcf + ".parentage_check.proportion_matches.txt")
print (args.vcf + ".parentage_check.ranks.txt")
print ("Done!")