Merge pull request #61 from DittmarLab/2.0b2
added alignment thresholds
qiyunzhu authored Dec 27, 2020
2 parents ba780df + 5d52c48 commit 329193c
Showing 2 changed files with 34 additions and 11 deletions.
36 changes: 26 additions & 10 deletions hgtector/analyze.py
@@ -254,12 +254,14 @@ def read_input(self):
print('Reading homology search results...')
self.data = {}
for sid, fname in self.input_map.items():
self.data[sid] = self.read_search_results(fname)
self.data[sid] = self.read_search_results(
fname, self.maxhits, self.evalue, self.identity, self.coverage)
print(f' {sid}: {len(self.data[sid])} proteins.')
print(f'Done. Read search results of {len(self.data)} samples.')

@staticmethod
def read_search_results(file, maxhits=None):
def read_search_results(file, maxhits=None, evalue=None, identity=None,
coverage=None):
"""Read homology search results of one sample.
Parameters
@@ -268,35 +270,41 @@ def read_search_results(file, maxhits=None):
input filepath
maxhits : int
maximum number of hits per protein to preserve
evalue : float
maximum E-value cutoff
identity : int
minimum percent identity cutoff
coverage : int
minimum percent query coverage cutoff
Returns
-------
list of dict
search results
"""
# read search result file
p = re.compile(r'# (\S+): (.*)')
data = []
with read_file(file) as f:
for line in f:
line = line.rstrip('\r\n')
m = p.match(line)
if m:
if m.group(1) == 'ID':
key = m.group(1)
if key == 'ID':
data.append({'id': m.group(2), 'hits': []})
elif m.group(1) == 'Length':
elif key == 'Length':
data[-1]['length'] = int(m.group(2))
elif m.group(1) == 'Product':
elif key == 'Product':
data[-1]['product'] = m.group(2)
elif m.group(1) == 'Score':
elif key == 'Score':
data[-1]['score'] = float(m.group(2))
else:
data[-1]['hits'].append(line)
if len(data[-1]['hits']) == (maxhits or 0):
break

# convert hit table to DataFrame
for i in range(len(data)):
data[i]['hits'] = pd.read_csv(
# convert hit table to DataFrame
hits = pd.read_csv(
StringIO('\n'.join(data[i]['hits'])), sep='\t', na_values='*',
names=['id', 'identity', 'evalue', 'score', 'coverage',
'taxid'],
@@ -306,6 +314,14 @@ def read_search_results(file, maxhits=None):
'score': np.float32,
'coverage': np.float32,
'taxid': str}).set_index('id')

# filter hits by thresholds
hits = hits.query(
'evalue <= {} & identity >= {} & coverage >= {}'.format(
(evalue or 100), (identity or 0), (coverage or 0)))

# limit number of hits
data[i]['hits'] = hits.head(maxhits) if maxhits else hits
return data

def assign_taxonomy(self):
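
For context, here is a minimal standalone sketch of the hit-filtering logic introduced above, using the same query pattern and no-op defaults as the diff. The toy DataFrame and threshold values are invented for illustration and are not part of the commit.

# toy sketch of the new threshold filter (not part of the commit)
import pandas as pd

hits = pd.DataFrame({
    'id':       ['h1', 'h2', 'h3'],
    'identity': [99.0, 80.0, 97.5],
    'evalue':   [1e-100, 1e-10, 1e-60],
    'coverage': [100.0, 50.0, 96.0],
}).set_index('id')

evalue, identity, coverage, maxhits = 1e-50, 95, 95, None

# thresholds fall back to permissive values when not provided,
# so omitting them keeps every hit
hits = hits.query(
    'evalue <= {} & identity >= {} & coverage >= {}'.format(
        (evalue or 100), (identity or 0), (coverage or 0)))

# maxhits, if given, is applied after the threshold filter
hits = hits.head(maxhits) if maxhits else hits
print(hits)  # h1 and h3 pass; h2 fails all three cutoffs
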
9 changes: 8 additions & 1 deletion hgtector/tests/test_analyze.py
@@ -41,6 +41,7 @@ def tearDown(self):
def test___call__(self):
# run Ecoli sample using the Silverman method
me = Analyze()

def args(): return None
args.input = join(self.datadir, 'Ecoli', 'search')
args.output = join(self.tmpdir, 'output')
@@ -67,6 +68,7 @@ def args(): return None
def test_set_parameters(self):
me = Analyze()
me.cfg = load_configs()

def args(): return None

# input is file
@@ -179,7 +181,12 @@ def test_read_search_results(self):
self.assertEqual(obs[0]['hits']['taxid']['NP_384288.1'], '266834')

# maximum number of hits
obs = Analyze.read_search_results(file, 5)
obs = Analyze.read_search_results(file, maxhits=8)
self.assertEqual(len(obs[0]['hits']), 8)

# alignment thresholds
obs = Analyze.read_search_results(
file, evalue=1e-50, identity=95, coverage=95)
self.assertEqual(len(obs[0]['hits']), 5)

def test_assign_taxonomy(self):
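
As a hypothetical usage sketch (not taken from the repository's tests), the new thresholds can be exercised end to end on a small hand-written input. This assumes the hgtector package (2.0b2 or later) is importable and accepts a plain-text search-result file in the header-plus-tab-separated-hits format parsed above; the file name, protein ID, and hit values are invented.

# hypothetical end-to-end sketch; input file and names are invented
import os
import tempfile

from hgtector.analyze import Analyze

content = '\n'.join([
    '# ID: prot1',
    '# Length: 120',
    # hit columns: id, identity, evalue, score, coverage, taxid
    'hitA\t99.0\t1e-80\t250.0\t100.0\t562',
    'hitB\t70.0\t1e-5\t50.0\t40.0\t620',
]) + '\n'

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, 'prot1.tsv')
    with open(path, 'w') as f:
        f.write(content)

    # only hitA satisfies all three cutoffs
    obs = Analyze.read_search_results(
        path, evalue=1e-50, identity=95, coverage=95)
    print(len(obs[0]['hits']))  # expected: 1
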
