Merge pull request #61 from DittmarLab/2.0b2
added alignment thresholds
qiyunzhu authored Dec 27, 2020
2 parents ba780df + 5d52c48 commit 329193c
Showing 2 changed files with 34 additions and 11 deletions.
36 changes: 26 additions & 10 deletions hgtector/analyze.py
@@ -254,12 +254,14 @@ def read_input(self):
print('Reading homology search results...')
self.data = {}
for sid, fname in self.input_map.items():
self.data[sid] = self.read_search_results(fname)
self.data[sid] = self.read_search_results(
fname, self.maxhits, self.evalue, self.identity, self.coverage)
print(f' {sid}: {len(self.data[sid])} proteins.')
print(f'Done. Read search results of {len(self.data)} samples.')

@staticmethod
def read_search_results(file, maxhits=None):
def read_search_results(file, maxhits=None, evalue=None, identity=None,
coverage=None):
"""Read homology search results of one sample.
Parameters
@@ -268,35 +270,41 @@ def read_search_results(file, maxhits=None):
input filepath
maxhits : int
maximum number of hits per protein to preserve
evalue : float
maximum E-value cutoff
identity : int
minimum percent identity cutoff
coverage : int
minimum percent query coverage cutoff
Returns
-------
list of dict
search results
"""
# read search result file
p = re.compile(r'# (\S+): (.*)')
data = []
with read_file(file) as f:
for line in f:
line = line.rstrip('\r\n')
m = p.match(line)
if m:
if m.group(1) == 'ID':
key = m.group(1)
if key == 'ID':
data.append({'id': m.group(2), 'hits': []})
elif m.group(1) == 'Length':
elif key == 'Length':
data[-1]['length'] = int(m.group(2))
elif m.group(1) == 'Product':
elif key == 'Product':
data[-1]['product'] = m.group(2)
elif m.group(1) == 'Score':
elif key == 'Score':
data[-1]['score'] = float(m.group(2))
else:
data[-1]['hits'].append(line)
if len(data[-1]['hits']) == (maxhits or 0):
break

# convert hit table to DataFrame
for i in range(len(data)):
data[i]['hits'] = pd.read_csv(
# convert hit table to DataFrame
hits = pd.read_csv(
StringIO('\n'.join(data[i]['hits'])), sep='\t', na_values='*',
names=['id', 'identity', 'evalue', 'score', 'coverage',
'taxid'],
@@ -306,6 +314,14 @@ def read_search_results(file, maxhits=None):
'score': np.float32,
'coverage': np.float32,
'taxid': str}).set_index('id')

# filter hits by thresholds
hits = hits.query(
'evalue <= {} & identity >= {} & coverage >= {}'.format(
(evalue or 100), (identity or 0), (coverage or 0)))

# limit number of hits
data[i]['hits'] = hits.head(maxhits) if maxhits else hits
return data

def assign_taxonomy(self):
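
For context, here is a minimal standalone sketch of the hit-filtering logic introduced above, using the same query pattern and no-op defaults as the diff. The toy DataFrame and threshold values are invented for illustration and are not part of the commit.

# toy sketch of the new threshold filter (not part of the commit)
import pandas as pd

hits = pd.DataFrame({
    'id':       ['h1', 'h2', 'h3'],
    'identity': [99.0, 80.0, 97.5],
    'evalue':   [1e-100, 1e-10, 1e-60],
    'coverage': [100.0, 50.0, 96.0],
}).set_index('id')

evalue, identity, coverage, maxhits = 1e-50, 95, 95, None

# thresholds fall back to permissive values when not provided,
# so omitting them keeps every hit
hits = hits.query(
    'evalue <= {} & identity >= {} & coverage >= {}'.format(
        (evalue or 100), (identity or 0), (coverage or 0)))

# maxhits, if given, is applied after the threshold filter
hits = hits.head(maxhits) if maxhits else hits
print(hits)  # h1 and h3 pass; h2 fails all three cutoffs
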
9 changes: 8 additions & 1 deletion hgtector/tests/test_analyze.py
@@ -41,6 +41,7 @@ def tearDown(self):
def test___call__(self):
# run Ecoli sample using the Silverman method
me = Analyze()

def args(): return None
args.input = join(self.datadir, 'Ecoli', 'search')
args.output = join(self.tmpdir, 'output')
@@ -67,6 +68,7 @@ def args(): return None
def test_set_parameters(self):
me = Analyze()
me.cfg = load_configs()

def args(): return None

# input is file
@@ -179,7 +181,12 @@ def test_read_search_results(self):
self.assertEqual(obs[0]['hits']['taxid']['NP_384288.1'], '266834')

# maximum number of hits
obs = Analyze.read_search_results(file, 5)
obs = Analyze.read_search_results(file, maxhits=8)
self.assertEqual(len(obs[0]['hits']), 8)

# alignment thresholds
obs = Analyze.read_search_results(
file, evalue=1e-50, identity=95, coverage=95)
self.assertEqual(len(obs[0]['hits']), 5)

def test_assign_taxonomy(self):
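
As a hypothetical usage sketch (not taken from the repository's tests), the new thresholds can be exercised end to end on a small hand-written input. This assumes the hgtector package (2.0b2 or later) is importable and accepts a plain-text search-result file in the header-plus-tab-separated-hits format parsed above; the file name, protein ID, and hit values are invented.

# hypothetical end-to-end sketch; input file and names are invented
import os
import tempfile

from hgtector.analyze import Analyze

content = '\n'.join([
    '# ID: prot1',
    '# Length: 120',
    # hit columns: id, identity, evalue, score, coverage, taxid
    'hitA\t99.0\t1e-80\t250.0\t100.0\t562',
    'hitB\t70.0\t1e-5\t50.0\t40.0\t620',
]) + '\n'

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, 'prot1.tsv')
    with open(path, 'w') as f:
        f.write(content)

    # only hitA satisfies all three cutoffs
    obs = Analyze.read_search_results(
        path, evalue=1e-50, identity=95, coverage=95)
    print(len(obs[0]['hits']))  # expected: 1
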
