-
Notifications
You must be signed in to change notification settings - Fork 0
/
VCF-to-Tab_to_Fasta_IUPAC_Converter.py
56 lines (43 loc) · 2.07 KB
/
VCF-to-Tab_to_Fasta_IUPAC_Converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/python
def usage():
print("""
This script is intended for modifying files from vcftools that contain
biallelic information and convert them to fasta format. After the vcf file has
been exported using the vcf-to-tab program from VCFTools, and transposed in R,
or Excel, this script will change biallelic information at each site to only
one nucleotide using UIPAC conventions when the alleles are different
(heterozygous) or to the available nucleotide if both alleles are the same.
If one alle is present and the other one is missing, the script will change
the site to the available allele. All of these changes will be saved to a
new file in fasta format.
written by Simon Uribe-Convers - www.simonuribe.com
October 23rd, 2017
To use this script, type: python3.6 VCF-to-Tab_to_Fasta_IUPAC_Converter.py VCF-to-Tab_file Output_file
""")
import sys
import __future__
if __name__ == "__main__":
if len(sys.argv) != 3:
usage()
print("~~~~Error~~~~\n\nCorrect usage: python3.6 "+sys.argv[0]+" VCF-to-Tab file + Output file")
sys.exit("Missing either the VCF-to-Tab and/or the output files!")
filename = open(sys.argv[1], "r")
outfile = open(sys.argv[2] + ".fasta", "w")
# def IUPAC_converter(filename, outfile):
IUPAC_Codes = { "G/G" : "G", "C/C" : "C", "T/T" : "T", "A/A" : "A",
"-/G" : "G", "-/C" : "C", "-/T" : "T", "-/A" : "A", "G/-" : "G",
"C/-" : "C", "T/-" : "T", "A/-" : "A", "G/T" : "K", "T/G" : "K",
"A/C" : "M", "C/A" : "M", "C/G" : "S", "G/C" : "S", "A/G" : "R",
"G/A" : "R", "A/T" : "W", "T/A" : "W", "C/T" : "Y", "T/C" : "Y",
"./." : "N", "-/-" : "N", "N/N" : "N", "-/N" : "N", "N/-" : "N",
"N/." : "N", "./N" : "N"
}
for line in filename:
species_name = line.strip().split(" ")[0]
data = line.strip().split(" ")[1:]
new_data = [IUPAC_Codes[i] for i in data]
# print(new_data)
new_data2 = "".join(new_data)
outfile.write(">" + species_name + "\n" + new_data2 + "\n")
print("\n\n~~~~\n\nResults can be found in %s.fasta\n\n~~~~" %sys.argv[2])
sys.exit(0)