Skip to content

Commit

Permalink
Refactor to method
Browse files Browse the repository at this point in the history
  • Loading branch information
colinbrislawn committed Oct 3, 2024
1 parent f24f45f commit 690f18e
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 182 deletions.
44 changes: 8 additions & 36 deletions q2_vsearch/_chimera.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from ._format import UchimeStatsFmt


_uchime_defaults = {'dn': 1.4,
_uchime_defaults = {'method': 'uchime',
'dn': 1.4,
'mindiffs': 3,
'mindiv': 0.8,
'minh': 0.28,
Expand Down Expand Up @@ -68,26 +69,29 @@ def _uchime_ref(sequences, table, reference_sequences, dn, mindiffs,

def uchime_denovo(sequences: DNAFASTAFormat,
table: biom.Table,
method: str = _uchime_defaults['method'],
dn: float = _uchime_defaults['dn'],
mindiffs: int = _uchime_defaults['mindiffs'],
mindiv: float = _uchime_defaults['mindiv'],
minh: float = _uchime_defaults['minh'],
xn: float = _uchime_defaults['xn']) \
-> (DNAFASTAFormat, DNAFASTAFormat, UchimeStatsFmt):
cmd, chimeras, nonchimeras, uchime_stats = \
_uchime_denovo(sequences, table, dn, mindiffs, mindiv, minh, xn)
_uchime_denovo(sequences, table, method,
dn, mindiffs, mindiv, minh, xn)
return chimeras, nonchimeras, uchime_stats


def _uchime_denovo(sequences, table, dn, mindiffs, mindiv, minh, xn):
def _uchime_denovo(sequences, table, method,
dn, mindiffs, mindiv, minh, xn):
# this function only exists to simplify testing
chimeras = DNAFASTAFormat()
nonchimeras = DNAFASTAFormat()
uchime_stats = UchimeStatsFmt()
with tempfile.NamedTemporaryFile() as fasta_with_sizes:
_fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)
cmd = ['vsearch',
'--uchime_denovo', fasta_with_sizes.name,
'--' + method + '_denovo', fasta_with_sizes.name,
'--uchimeout', str(uchime_stats),
'--nonchimeras', str(nonchimeras),
'--chimeras', str(chimeras),
Expand All @@ -102,35 +106,3 @@ def _uchime_denovo(sequences, table, dn, mindiffs, mindiv, minh, xn):
run_command(cmd)

return cmd, chimeras, nonchimeras, uchime_stats


def uchime2_denovo(sequences: DNAFASTAFormat,
                   table: biom.Table,
                   dn: float = _uchime_defaults['dn'],
                   xn: float = _uchime_defaults['xn']) \
        -> (DNAFASTAFormat, DNAFASTAFormat, UchimeStatsFmt):
    """Run vsearch ``--uchime2_denovo`` chimera detection.

    Public entry point that delegates to ``_uchime2_denovo`` and drops the
    command list that helper returns alongside the results.

    Parameters
    ----------
    sequences
        The feature sequences to be chimera-checked.
    table
        Feature table, forwarded to the helper together with the sequences.
    dn
        No vote pseudo-count (parameter n in the chimera scoring function).
    xn
        No vote weight (parameter beta in the scoring function).

    Returns
    -------
    tuple
        ``(chimeras, nonchimeras, uchime_stats)``.
    """
    # The helper also hands back the executed command; only tests need it.
    _command, chimeras, nonchimeras, uchime_stats = _uchime2_denovo(
        sequences=sequences, table=table, dn=dn, xn=xn)
    return chimeras, nonchimeras, uchime_stats


def _uchime2_denovo(sequences, table, dn, xn):
    """Build and run the vsearch ``--uchime2_denovo`` command.

    This function only exists to simplify testing: unlike the public
    wrapper it also returns the command list that was executed.

    Returns
    -------
    tuple
        ``(cmd, chimeras, nonchimeras, uchime_stats)`` where ``cmd`` is the
        argument list passed to ``run_command``.
    """
    chimeras = DNAFASTAFormat()
    nonchimeras = DNAFASTAFormat()
    uchime_stats = UchimeStatsFmt()

    with tempfile.NamedTemporaryFile() as sized_fasta:
        # Write the input for vsearch into the temporary file.
        _fasta_with_sizes(str(sequences), sized_fasta.name, table)

        input_args = ['vsearch', '--uchime2_denovo', sized_fasta.name]
        output_args = ['--uchimeout', str(uchime_stats),
                       '--nonchimeras', str(nonchimeras),
                       '--chimeras', str(chimeras)]
        scoring_args = ['--dn', str(dn),
                        '--xn', str(xn)]
        formatting_args = ['--qmask', 'none',  # ensures no lowercase DNA chars
                           '--xsize',
                           '--fasta_width', '0']
        cmd = input_args + output_args + scoring_args + formatting_args

        # Must run while the temporary input file still exists.
        run_command(cmd)

    return cmd, chimeras, nonchimeras, uchime_stats
66 changes: 19 additions & 47 deletions q2_vsearch/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,8 @@
'sequences': FeatureData[Sequence],
'table': FeatureTable[Frequency]},
parameters={
'method': qiime2.plugin.Str % qiime2.plugin.Choices(
['uchime', 'uchime2', 'uchime3']),
'dn': qiime2.plugin.Float % qiime2.plugin.Range(0., None),
'mindiffs': qiime2.plugin.Int % qiime2.plugin.Range(1, None),
'mindiv': qiime2.plugin.Float % qiime2.plugin.Range(0., None),
Expand All @@ -401,12 +403,21 @@
'abundances).'),
},
parameter_descriptions={
'method': ('Which algorithm to use.'),
# 'abskew': ('The abundance skew is used to distinguish in a threeway '
# 'alignment which sequence is the chimera and which are '
# 'the parents. The parent sequences must be this many '
# 'times more abundant than the child sequence to be '
# 'flagged as chimeric.'),
'dn': ('No vote pseudo-count, corresponding to the parameter n in '
'the chimera scoring function.'),
'mindiffs': 'Minimum number of differences per segment.',
'mindiv': 'Minimum divergence from closest parent.',
'mindiffs': 'Minimum number of differences per segment. '
'Ignored for uchime2 and uchime3.',
'mindiv': 'Minimum divergence from closest parent. '
'Ignored for uchime2 and uchime3.',
'minh': ('Minimum score (h). Increasing this value tends to reduce '
'the number of false positives and to decrease sensitivity.'),
'the number of false positives and to decrease sensitivity. '
'Ignored for uchime2 and uchime3.'),
'xn': ('No vote weight, corresponding to the parameter beta in the '
'scoring function.'),
},
Expand All @@ -416,50 +427,11 @@
'stats': 'Summary statistics from chimera checking.'
},
name='De novo chimera filtering.',
description=('Apply the vsearch uchime_denovo method to identify chimeric '
'feature sequences. The results of this method can be used '
'to filter chimeric features from the corresponding feature '
'table. For more details, please refer to the vsearch '
'documentation.')
)

plugin.methods.register_function(
function=q2_vsearch._chimera.uchime2_denovo,
inputs={
'sequences': FeatureData[Sequence],
'table': FeatureTable[Frequency]},
parameters={
'dn': qiime2.plugin.Float % qiime2.plugin.Range(0., None),
'xn': qiime2.plugin.Float % qiime2.plugin.Range(
1., None, inclusive_start=False)
},
outputs=[
('chimeras', FeatureData[Sequence]),
('nonchimeras', FeatureData[Sequence]),
('stats', UchimeStats)
],
input_descriptions={
'sequences': 'The feature sequences to be chimera-checked.',
'table': ('Feature table (used for computing total feature '
'abundances).'),
},
parameter_descriptions={
'dn': ('No vote pseudo-count, corresponding to the parameter n in '
'the chimera scoring function.'),
'xn': ('No vote weight, corresponding to the parameter beta in the '
'scoring function.'),
},
output_descriptions={
'chimeras': 'The chimeric sequences.',
'nonchimeras': 'The non-chimeric sequences.',
'stats': 'Summary statistics from chimera checking.'
},
name='De novo chimera filtering designed for denoised amplicons.',
description=('Apply the vsearch uchime2_denovo method to identify '
'chimeric feature sequences. The results of this method '
'can be used to filter chimeric features from the '
'corresponding feature table. For more details, '
'please refer to the vsearch documentation.')
description=('Apply one of the vsearch uchime*_denovo methods to '
'identify chimeric feature sequences. '
'The results of these methods can be used to filter chimeric '
'features from the corresponding feature table. '
'For more details, please refer to the vsearch manual.')
)


Expand Down
102 changes: 3 additions & 99 deletions q2_vsearch/tests/test_chimera.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
from qiime2.util import redirected_stdio
from q2_types.feature_data import DNAFASTAFormat
from q2_vsearch._chimera import (uchime_denovo, _uchime_denovo,
uchime2_denovo,
# _uchime2_denovo, # see note on line 199
uchime_ref, _uchime_ref)
from q2_vsearch._format import UchimeStatsFmt
from .test_cluster_features import _read_seqs
Expand Down Expand Up @@ -49,6 +47,7 @@ def test_uchime_denovo(self):

obs_chime = _read_seqs(chime)
exp_chime = [self.input_sequences_list[3]]
# >feature4 is the chimera!
self.assertEqual(obs_chime, exp_chime)

# sequences are reverse-sorted by abundance in output
Expand Down Expand Up @@ -107,112 +106,17 @@ def test_uchime_denovo_no_chimeras_alt_params(self):
with redirected_stdio(stderr=os.devnull):
cmd, chime, nonchime, stats = _uchime_denovo(
sequences=self.input_sequences, table=self.input_table,
method='uchime3',
dn=42.42, mindiffs=4, mindiv=0.5, minh=0.42, xn=9.0)
cmd = ' '.join(cmd)
self.assertTrue('--uchime3_denovo' in cmd)
self.assertTrue('--dn 42.42' in cmd)
self.assertTrue('--mindiffs 4' in cmd)
self.assertTrue('--mindiv 0.5' in cmd)
self.assertTrue('--minh 0.42' in cmd)
self.assertTrue('--xn 9.0' in cmd)


class Uchime2DenovoTests(TestPluginBase):
    """Tests for ``uchime2_denovo`` using a four-feature table."""

    package = 'q2_vsearch.tests'

    # Feature ids used by every table in this class and expected in the stats.
    _feature_ids = ['feature1', 'feature2', 'feature3', 'feature4']

    def setUp(self):
        super().setUp()
        input_sequences_fp = self.get_data_path('dna-sequences-3.fasta')
        self.input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r')
        self.input_sequences_list = _read_seqs(self.input_sequences)

        self.input_table = biom.Table(np.array([[100, 101, 103],
                                                [99, 98, 99],
                                                [4, 5, 6],
                                                [2, 2, 2]]),
                                      ['feature1', 'feature2', 'feature3',
                                       'feature4'],
                                      ['sample1', 'sample2', 'sample3'])

    def _assert_stats_report_all_features(self, stats):
        """Check the stats output names every feature on exactly four lines.

        ``assertIn`` is used instead of ``assertTrue(x in y)`` so a failure
        reports which feature id is missing and what the text contained.
        """
        with stats.open() as stats_fh:
            stats_text = stats_fh.read()
        for feature_id in self._feature_ids:
            self.assertIn(feature_id, stats_text)
        stats_lines = [e for e in stats_text.split('\n')
                       if len(e) > 0]
        self.assertEqual(len(stats_lines), 4)

    def test_uchime2_denovo(self):
        with redirected_stdio(stderr=os.devnull):
            chime, nonchime, stats = uchime2_denovo(
                sequences=self.input_sequences, table=self.input_table)

        obs_chime = _read_seqs(chime)
        # >feature4 is the chimera!
        exp_chime = [self.input_sequences_list[3]]
        self.assertEqual(obs_chime, exp_chime)

        # sequences are reverse-sorted by abundance in output
        obs_nonchime = _read_seqs(nonchime)
        exp_nonchime = [self.input_sequences_list[0],
                        self.input_sequences_list[1],
                        self.input_sequences_list[2]]
        # Note how >feature4 is gone!
        self.assertEqual(obs_nonchime, exp_nonchime)

        self._assert_stats_report_all_features(stats)

    def test_uchime2_denovo_no_chimeras(self):
        input_table = biom.Table(np.array([[3, 4, 2],
                                           [1, 0, 0],
                                           [4, 5, 6],
                                           [2, 2, 2]]),
                                 ['feature1', 'feature2', 'feature3',
                                  'feature4'],
                                 ['sample1', 'sample2', 'sample3'])
        with redirected_stdio(stderr=os.devnull):
            chime, nonchime, stats = uchime2_denovo(
                sequences=self.input_sequences, table=input_table)

        obs_chime = _read_seqs(chime)
        exp_chime = []
        self.assertEqual(obs_chime, exp_chime)

        # sequences are reverse-sorted by abundance in output
        obs_nonchime = _read_seqs(nonchime)
        exp_nonchime = [self.input_sequences_list[2],
                        self.input_sequences_list[0],
                        self.input_sequences_list[3],
                        self.input_sequences_list[1]]
        self.assertEqual(obs_nonchime, exp_nonchime)

        self._assert_stats_report_all_features(stats)

    # NOTE(review): is an alt-params test also needed for this flavor of the
    # function? Removing it still keeps coverage at 100%, so it is left
    # disabled here pending a decision.
    # def test_uchime2_denovo_no_chimeras_alt_params(self):
    #     with redirected_stdio(stderr=os.devnull):
    #         cmd, chime, nonchime, stats = _uchime2_denovo(
    #             sequences=self.input_sequences, table=self.input_table,
    #             dn=9999.42, xn=9.01)
    #     cmd = ' '.join(cmd)
    #     self.assertTrue('--dn 9999.42' in cmd)
    #     self.assertTrue('--xn 9.01' in cmd)

    #     obs_chime = _read_seqs(chime)
    #     # >feature4 is the chimera!
    #     exp_chime = [self.input_sequences_list[3]]
    #     self.assertEqual(obs_chime, exp_chime)


class UchimeRefTests(TestPluginBase):

package = 'q2_vsearch.tests'
Expand Down

0 comments on commit 690f18e

Please sign in to comment.