Merge pull request #60 from DittmarLab/2.0b2

Fixed bug in database.build_blast_db
qiyunlab · Jul 29, 2020 · ba780df · ba780df
2 parents 3061428 + 89aef9d
commit ba780df
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 21 deletions.
diff --git a/hgtector/database.py b/hgtector/database.py
@@ -591,7 +591,7 @@ def write_prot():
                 fin = gzip.open(fp, 'r')
             cp = None
             for line in fin:
-                line = line.rstrip()
+                line = line.rstrip('\r\n')
                 if line.startswith('>'):
                     write_prot()
                     p, name = line[1:].split(None, 1)
@@ -713,7 +713,7 @@ def build_taxdump(self):
             fo = open(join(dir_, fname), 'w')
             fi = open(join(self.tmpdir, fname), 'r')
             for line in fi:
-                row = line.rstrip().replace('\t|', '').split('\t')
+                row = line.rstrip('\r\n').replace('\t|', '').split('\t')
                 if row[0] in ancs:
                     if fname == 'nodes.dmp' or 'scientific name' in row:
                         fo.write(line)
@@ -748,6 +748,13 @@ def compile_database(self):
     def build_blast_db(self):
         """Build BLAST database.
         """
+        # create temporary taxon map
+        taxonmap = join(self.tmpdir, 'taxon.map')
+        with open(taxonmap, 'w') as f:
+            for p, tid in sorted(self.taxonmap.items()):
+                f.write(f'{p}\t{tid}\n')
+
+        # build BLAST database
         makedirs(join(self.output, 'blast'), exist_ok=True)
         cmd = ' '.join((
             self.makeblastdb,
@@ -756,11 +763,14 @@ def build_blast_db(self):
             '-out', join(self.output, 'blast', 'db'),
             '-title', 'db',
             '-parse_seqids',
-            '-taxid_map', join(self.output, 'taxon.map')))
+            '-taxid_map', taxonmap))
         print('Build BLAST database...', flush=True)
         ec, out = run_command(cmd)
         if ec:
             raise ValueError(f'makeblastdb failed with error code {ec}.')
+
+        # clean up
+        remove(taxonmap)
         print('Done.')
 
     def build_diamond_db(self):

diff --git a/hgtector/search.py b/hgtector/search.py
@@ -562,7 +562,7 @@ def input_wf(self):
                 scores = {}
                 with open(file, 'r') as f:
                     for line in f:
-                        line = line.rstrip()
+                        line = line.rstrip('\r\n')
                         if not line or line.startswith('#'):
                             continue
                         id_, score = line.split('\t')
@@ -1110,7 +1110,7 @@ def parse_prev_results(fp, prots):
         with open(fp, 'r') as f:
             for line in f:
                 if line.startswith('# ID: '):
-                    done.append(line[6:].rstrip())
+                    done.append(line[6:].rstrip('\r\n'))
         doneset = set(done)
         for prot in prots:
             if prot['id'] in doneset:
@@ -1675,7 +1675,7 @@ def parse_hit_table(self, file, lenmap=None):
         lines = []
         with open(file, 'r') as f:
             for line in f:
-                line = line.rstrip()
+                line = line.rstrip('\r\n')
                 if line and not line.startswith('#'):
                     lines.append(line)
                     if ism8 is None:
@@ -1702,7 +1702,7 @@ def parse_def_table(self, lines):
         ths = {x: getattr(self, x, 0) for x in (
             'evalue', 'identity', 'coverage', 'maxhits')}
         for line in lines:
-            line = line.rstrip()
+            line = line.rstrip('\r\n')
             if not line or line.startswith('#'):
                 continue
             x = line.split('\t')
@@ -1756,7 +1756,7 @@ def parse_m8_table(self, lines, lenmap):
         ths = {x: getattr(self, x, 0) for x in (
             'evalue', 'identity', 'coverage', 'maxhits')}
         for line in lines:
-            line = line.rstrip()
+            line = line.rstrip('\r\n')
             if not line or line.startswith('#'):
                 continue
             x = line.split('\t')
@@ -2238,7 +2238,7 @@ def parse_self_m8(lines):
         res = []
         used = set()
         for line in lines:
-            x = line.rstrip().split('\t')
+            x = line.rstrip('\r\n').split('\t')
             if x[0].startswith('#'):
                 continue
             if len(x) < 12:

diff --git a/hgtector/tests/test_database.py b/hgtector/tests/test_database.py
@@ -300,14 +300,12 @@ def test_compile_database(self):
         # get database files
         copy(join(self.datadir, 'DnaK', 'linear.faa'),
              join(self.tmpdir, 'db.faa'))
-        copy(join(self.datadir, 'DnaK', 'prot2tid.txt'),
-             join(self.tmpdir, 'taxon.map'))
         makedirs(join(self.tmpdir, 'taxdump'))
         copy(join(self.datadir, 'DnaK', 'taxdump', 'nodes.dmp'),
              join(self.tmpdir, 'taxdump', 'nodes.dmp'))
         copy(join(self.datadir, 'DnaK', 'taxdump', 'names.dmp'),
              join(self.tmpdir, 'taxdump', 'names.dmp'))
-        with open(join(self.tmpdir, 'taxon.map'), 'r') as f:
+        with open(join(self.datadir, 'DnaK', 'prot2tid.txt'), 'r') as f:
             me.taxonmap = dict(x.split('\t') for x in f.read().splitlines())
 
         # set parameters
@@ -344,24 +342,23 @@ def test_compile_database(self):
 
         # clean up
         remove(join(self.tmpdir, 'db.faa'))
-        remove(join(self.tmpdir, 'taxon.map'))
         rmtree(join(self.tmpdir, 'taxdump'))
 
     def test_build_blast_db(self):
         me = Database()
         me.output = self.tmpdir
         me.makeblastdb = 'makeblastdb'
+        me.tmpdir = self.tmpdir
         copy(join(self.datadir, 'DnaK', 'linear.faa'),
              join(self.tmpdir, 'db.faa'))
-        copy(join(self.datadir, 'DnaK', 'prot2tid.txt'),
-             join(self.tmpdir, 'taxon.map'))
+        with open(join(self.datadir, 'DnaK', 'prot2tid.txt'), 'r') as f:
+            me.taxonmap = dict(x.split('\t') for x in f.read().splitlines())
         me.build_blast_db()
         self.assertTrue(isdir(join(self.tmpdir, 'blast')))
         for ext in ('phr', 'pin', 'pog', 'psd', 'psi', 'psq'):
             self.assertTrue(isfile(join(self.tmpdir, 'blast', f'db.{ext}')))
         rmtree(join(self.tmpdir, 'blast'))
         remove(join(self.tmpdir, 'db.faa'))
-        remove(join(self.tmpdir, 'taxon.map'))
 
     def test_build_diamond_db(self):
         me = Database()

diff --git a/hgtector/util.py b/hgtector/util.py
@@ -326,7 +326,7 @@ def read_input_prots(fp):
     lines = []
     with read_file(fp) as f:
         for line in f:
-            line = line.rstrip()
+            line = line.rstrip('\r\n')
             if not line or line.startswith('#'):
                 continue
 
@@ -375,7 +375,7 @@ def read_fasta(lines):
     """
     seqs = []
     for line in lines:
-        line = line.rstrip()
+        line = line.rstrip('\r\n')
         if line.startswith('>'):
             x = line[1:].split(None, 1)
             seqs.append([x[0], get_product(x[1]) if len(x) > 1 else '', ''])
@@ -579,11 +579,11 @@ def read_taxdump(dir_):
     taxdump = {}
     with open(join(dir_, 'nodes.dmp'), 'r') as f:
         for line in f:
-            x = line.rstrip().replace('\t|', '').split('\t')
+            x = line.rstrip('\r\n').replace('\t|', '').split('\t')
             taxdump[x[0]] = {'parent': x[1], 'rank': x[2]}
     with open(join(dir_, 'names.dmp'), 'r') as f:
         for line in f:
-            x = line.rstrip().replace('\t|', '').split('\t')
+            x = line.rstrip('\r\n').replace('\t|', '').split('\t')
             if len(x) < 4 or x[3] == 'scientific name':
                 try:
                     taxdump[x[0]]['name'] = x[1]
@@ -616,7 +616,7 @@ def read_prot2taxid(file):
     res = {}
     with read_file(file) as f:
         for line in f:
-            x = line.rstrip().split('\t')
+            x = line.rstrip('\r\n').split('\t')
             if len(x) == 1:
                 continue
             if isncbi is None: