Skip to content

Commit

Permalink
Merge pull request #60 from DittmarLab/2.0b2
Browse files Browse the repository at this point in the history
Fixed bug in database.build_blast_db
  • Loading branch information
qiyunzhu authored Jul 29, 2020
2 parents 3061428 + 89aef9d commit ba780df
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 21 deletions.
16 changes: 13 additions & 3 deletions hgtector/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,7 @@ def write_prot():
fin = gzip.open(fp, 'r')
cp = None
for line in fin:
line = line.rstrip()
line = line.rstrip('\r\n')
if line.startswith('>'):
write_prot()
p, name = line[1:].split(None, 1)
Expand Down Expand Up @@ -713,7 +713,7 @@ def build_taxdump(self):
fo = open(join(dir_, fname), 'w')
fi = open(join(self.tmpdir, fname), 'r')
for line in fi:
row = line.rstrip().replace('\t|', '').split('\t')
row = line.rstrip('\r\n').replace('\t|', '').split('\t')
if row[0] in ancs:
if fname == 'nodes.dmp' or 'scientific name' in row:
fo.write(line)
Expand Down Expand Up @@ -748,6 +748,13 @@ def compile_database(self):
def build_blast_db(self):
"""Build BLAST database.
"""
# create temporary taxon map
taxonmap = join(self.tmpdir, 'taxon.map')
with open(taxonmap, 'w') as f:
for p, tid in sorted(self.taxonmap.items()):
f.write(f'{p}\t{tid}\n')

# build BLAST database
makedirs(join(self.output, 'blast'), exist_ok=True)
cmd = ' '.join((
self.makeblastdb,
Expand All @@ -756,11 +763,14 @@ def build_blast_db(self):
'-out', join(self.output, 'blast', 'db'),
'-title', 'db',
'-parse_seqids',
'-taxid_map', join(self.output, 'taxon.map')))
'-taxid_map', taxonmap))
print('Build BLAST database...', flush=True)
ec, out = run_command(cmd)
if ec:
raise ValueError(f'makeblastdb failed with error code {ec}.')

# clean up
remove(taxonmap)
print('Done.')

def build_diamond_db(self):
Expand Down
12 changes: 6 additions & 6 deletions hgtector/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ def input_wf(self):
scores = {}
with open(file, 'r') as f:
for line in f:
line = line.rstrip()
line = line.rstrip('\r\n')
if not line or line.startswith('#'):
continue
id_, score = line.split('\t')
Expand Down Expand Up @@ -1110,7 +1110,7 @@ def parse_prev_results(fp, prots):
with open(fp, 'r') as f:
for line in f:
if line.startswith('# ID: '):
done.append(line[6:].rstrip())
done.append(line[6:].rstrip('\r\n'))
doneset = set(done)
for prot in prots:
if prot['id'] in doneset:
Expand Down Expand Up @@ -1675,7 +1675,7 @@ def parse_hit_table(self, file, lenmap=None):
lines = []
with open(file, 'r') as f:
for line in f:
line = line.rstrip()
line = line.rstrip('\r\n')
if line and not line.startswith('#'):
lines.append(line)
if ism8 is None:
Expand All @@ -1702,7 +1702,7 @@ def parse_def_table(self, lines):
ths = {x: getattr(self, x, 0) for x in (
'evalue', 'identity', 'coverage', 'maxhits')}
for line in lines:
line = line.rstrip()
line = line.rstrip('\r\n')
if not line or line.startswith('#'):
continue
x = line.split('\t')
Expand Down Expand Up @@ -1756,7 +1756,7 @@ def parse_m8_table(self, lines, lenmap):
ths = {x: getattr(self, x, 0) for x in (
'evalue', 'identity', 'coverage', 'maxhits')}
for line in lines:
line = line.rstrip()
line = line.rstrip('\r\n')
if not line or line.startswith('#'):
continue
x = line.split('\t')
Expand Down Expand Up @@ -2238,7 +2238,7 @@ def parse_self_m8(lines):
res = []
used = set()
for line in lines:
x = line.rstrip().split('\t')
x = line.rstrip('\r\n').split('\t')
if x[0].startswith('#'):
continue
if len(x) < 12:
Expand Down
11 changes: 4 additions & 7 deletions hgtector/tests/test_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,14 +300,12 @@ def test_compile_database(self):
# get database files
copy(join(self.datadir, 'DnaK', 'linear.faa'),
join(self.tmpdir, 'db.faa'))
copy(join(self.datadir, 'DnaK', 'prot2tid.txt'),
join(self.tmpdir, 'taxon.map'))
makedirs(join(self.tmpdir, 'taxdump'))
copy(join(self.datadir, 'DnaK', 'taxdump', 'nodes.dmp'),
join(self.tmpdir, 'taxdump', 'nodes.dmp'))
copy(join(self.datadir, 'DnaK', 'taxdump', 'names.dmp'),
join(self.tmpdir, 'taxdump', 'names.dmp'))
with open(join(self.tmpdir, 'taxon.map'), 'r') as f:
with open(join(self.datadir, 'DnaK', 'prot2tid.txt'), 'r') as f:
me.taxonmap = dict(x.split('\t') for x in f.read().splitlines())

# set parameters
Expand Down Expand Up @@ -344,24 +342,23 @@ def test_compile_database(self):

# clean up
remove(join(self.tmpdir, 'db.faa'))
remove(join(self.tmpdir, 'taxon.map'))
rmtree(join(self.tmpdir, 'taxdump'))

def test_build_blast_db(self):
me = Database()
me.output = self.tmpdir
me.makeblastdb = 'makeblastdb'
me.tmpdir = self.tmpdir
copy(join(self.datadir, 'DnaK', 'linear.faa'),
join(self.tmpdir, 'db.faa'))
copy(join(self.datadir, 'DnaK', 'prot2tid.txt'),
join(self.tmpdir, 'taxon.map'))
with open(join(self.datadir, 'DnaK', 'prot2tid.txt'), 'r') as f:
me.taxonmap = dict(x.split('\t') for x in f.read().splitlines())
me.build_blast_db()
self.assertTrue(isdir(join(self.tmpdir, 'blast')))
for ext in ('phr', 'pin', 'pog', 'psd', 'psi', 'psq'):
self.assertTrue(isfile(join(self.tmpdir, 'blast', f'db.{ext}')))
rmtree(join(self.tmpdir, 'blast'))
remove(join(self.tmpdir, 'db.faa'))
remove(join(self.tmpdir, 'taxon.map'))

def test_build_diamond_db(self):
me = Database()
Expand Down
10 changes: 5 additions & 5 deletions hgtector/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def read_input_prots(fp):
lines = []
with read_file(fp) as f:
for line in f:
line = line.rstrip()
line = line.rstrip('\r\n')
if not line or line.startswith('#'):
continue

Expand Down Expand Up @@ -375,7 +375,7 @@ def read_fasta(lines):
"""
seqs = []
for line in lines:
line = line.rstrip()
line = line.rstrip('\r\n')
if line.startswith('>'):
x = line[1:].split(None, 1)
seqs.append([x[0], get_product(x[1]) if len(x) > 1 else '', ''])
Expand Down Expand Up @@ -579,11 +579,11 @@ def read_taxdump(dir_):
taxdump = {}
with open(join(dir_, 'nodes.dmp'), 'r') as f:
for line in f:
x = line.rstrip().replace('\t|', '').split('\t')
x = line.rstrip('\r\n').replace('\t|', '').split('\t')
taxdump[x[0]] = {'parent': x[1], 'rank': x[2]}
with open(join(dir_, 'names.dmp'), 'r') as f:
for line in f:
x = line.rstrip().replace('\t|', '').split('\t')
x = line.rstrip('\r\n').replace('\t|', '').split('\t')
if len(x) < 4 or x[3] == 'scientific name':
try:
taxdump[x[0]]['name'] = x[1]
Expand Down Expand Up @@ -616,7 +616,7 @@ def read_prot2taxid(file):
res = {}
with read_file(file) as f:
for line in f:
x = line.rstrip().split('\t')
x = line.rstrip('\r\n').split('\t')
if len(x) == 1:
continue
if isncbi is None:
Expand Down

0 comments on commit ba780df

Please sign in to comment.