-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathmakeGenomeInfo
executable file
·107 lines (93 loc) · 3.54 KB
/
makeGenomeInfo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python3
from collections import namedtuple
import os, logging, string
from os.path import join, isfile, dirname
# when many genomes are installed, the list of genome info is better kept in a single
# file, so it is faster to read.
# this file goes through all subdirectories and collect info from all genomeInfo.tab file and
# write to genomes/genomeInfo.all.tab
# crispor.py will detect if this file is present and use it.
# do not forget to recreate the file each time you add a genome
# subdirectory with genome info
genomesDir = join(dirname(__file__), "genomes")
def readVarDbs(db):
""" find all possible variant VCFs and return as list of (shortLabel, fname, label, hasAF)
hasAF = file has the AF field (allele frequency). Means that the UI
will show the "frequency filter" button.
"""
# parse the descriptions of the VCF files
# descriptions are optional
labelFname = join(genomesDir, db, "vcfDescs.txt")
ret = []
if isfile(labelFname):
for line in open(labelFname):
if line.startswith("#"):
continue
fields = line.rstrip("\n").split("\t")
if len(fields)==4:
shortLabel, fname, desc, hasAF = fields
else:
print(labelFname, line)
assert(False)
fpath = join(genomesDir, db, fname)
if not isfile(fpath):
print("Error: Cannot find VCF file %s" % fpath)
continue
hasAF = (hasAF=="1")
ret.append( (shortLabel, fname, desc, hasAF) )
return ret
def lineFileNext(fh):
"""
parses tab-sep file with headers as field names
yields collection.namedtuples
strips "#"-prefix from header line
"""
line1 = fh.readline()
line1 = line1.strip("\n").strip("#")
headers = line1.split("\t")
Record = namedtuple('tsvRec', headers)
for line in fh:
line = line.rstrip("\n")
fields = line.split("\t")
try:
rec = Record(*fields)
except Exception as msg:
logging.error("Exception occured while parsing line, %s" % msg)
logging.error("Filename %s" % fh.name)
logging.error("Line was: %s" % repr(line))
logging.error("Does number of fields match headers?")
logging.error("Headers are: %s" % headers)
#raise Exception("wrong field count in line %s" % line)
continue
# convert fields to correct data type
yield rec
fields = ["name","description", "genome", "scientificName", "url", "server"]
tmpFname = join(genomesDir, "genomeInfo.all.tab.tmp")
outFname = join(genomesDir, "genomeInfo.all.tab")
ofh = open(tmpFname, "w")
ofh.write("\t".join(fields)+"\n")
gCount = 0
for subDir in os.listdir(genomesDir):
infoFname = join(genomesDir, subDir, "genomeInfo.tab")
varDbs = readVarDbs(subDir)
if len(varDbs)!=0:
varDescStr = " + SNPs: "+(", ".join([row[0] for row in varDbs]))
else:
varDescStr = ""
if len(varDescStr)>30:
varDescStr = varDescStr[:30]+"..."
if not isfile(infoFname):
continue
for row in lineFileNext(open(infoFname)):
newRow = []
rowDict = row._asdict()
for field in fields:
val = rowDict.get(field, "")
if field=="description":
val += varDescStr
newRow.append(val)
ofh.write("\t".join(newRow)+"\n")
gCount += 1
ofh.close()
os.rename(tmpFname, outFname)
print("Recreated %s, %d genomes" % (outFname, gCount))