Skip to content

Commit

Permalink
NEW: Exposes strand parameter for cluster_features_de_novo (#90)
Browse files Browse the repository at this point in the history
  • Loading branch information
vaamb authored Oct 13, 2023
1 parent dd26404 commit 693d1c2
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 5 deletions.
7 changes: 5 additions & 2 deletions q2_vsearch/_cluster_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,9 @@ def _fasta_with_sizes(input_fasta_fp, output_fasta_fp, table):


def cluster_features_de_novo(sequences: DNAFASTAFormat, table: biom.Table,
perc_identity: float, threads: int = 1
perc_identity: float,
strand: str = 'plus',
threads: int = 1
) -> (biom.Table, DNAFASTAFormat):
clustered_sequences = DNAFASTAFormat()
with tempfile.NamedTemporaryFile() as fasta_with_sizes:
Expand All @@ -193,6 +195,7 @@ def cluster_features_de_novo(sequences: DNAFASTAFormat, table: biom.Table,
'--id', str(perc_identity),
'--centroids', str(clustered_sequences),
'--uc', out_uc.name,
'--strand', str(strand),
'--qmask', 'none', # ensures no lowercase DNA chars
'--xsize',
'--threads', str(threads),
Expand Down Expand Up @@ -347,7 +350,7 @@ def cluster_features_open_reference(ctx, sequences, table, reference_sequences,

de_novo_table, de_novo_seqs = cluster_features_de_novo(
sequences=unmatched_seqs, table=unmatched_table,
perc_identity=perc_identity, threads=threads)
perc_identity=perc_identity, strand=strand, threads=threads)

if skipped_closed_ref:
merged_reference_seqs, = merge_seqs(data=[reference_sequences,
Expand Down
3 changes: 3 additions & 0 deletions q2_vsearch/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
parameters={
'perc_identity': qiime2.plugin.Float % qiime2.plugin.Range(
0, 1, inclusive_start=False, inclusive_end=True),
'strand': qiime2.plugin.Str % qiime2.plugin.Choices(['plus', 'both']),
'threads': qiime2.plugin.Int % qiime2.plugin.Range(
0, 256, inclusive_start=True, inclusive_end=True)
},
Expand All @@ -68,6 +69,8 @@
'perc_identity': ('The percent identity at which clustering should be '
'performed. This parameter maps to vsearch\'s --id '
'parameter.'),
'strand': ('Search plus (i.e., forward) or both (i.e., forward and '
'reverse complement) strands.'),
'threads': ('The number of threads to use for computation. Passing 0 '
'will launch one thread per CPU core.')
},
Expand Down
8 changes: 8 additions & 0 deletions q2_vsearch/tests/data/dna-sequences-4.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>feature1
AAACGTTACGGTTAACTATACATGCAGAAGACTAATCGGCATGGTGCGACCGGTGTCCAAGTAACGCCACCCCCCCTAGTGAAATAGGAAAGTAAATAAC
>feature2
ACGTACGTACGTACGTACGTACGTACGTACGTGCATGGTGCGACCGGTGTCCAAGTAACGCCACCCCCCCTAGTGAAATAGGAAAGTAAATAACACGTAC
>feature3
GTTATTTACTTTCCTATTTCACTAGGGGGGGTGGCGTTACTTGGACACCGGTCGCACCATGCCGATTAGTCTTCTGCATGTATAGTTAACCGTAACGTTC
>feature4
GGGCGTTACGGTTAACTATACATGCAGAAGACTAATCGGCATGGTGCGACCGGTGTCCAAGTAACGCCACCCCCCCTAGTGAAATAGGAAAGTAAATAAC
60 changes: 57 additions & 3 deletions q2_vsearch/tests/test_cluster_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,60 @@ def test_1_percent_clustering(self):
exp_seqs = [self.input_sequences_list[0]]
self.assertEqual(obs_seqs, exp_seqs)

def test_99_percent_clustering_strand_no_clustering(self):
# Feature3 of dna-sequences-4 if the reverse comp of Feature3 of
# dna-sequences-1 and should only cluster if strand is set to 'both'
input_sequences_fp = self.get_data_path('dna-sequences-4.fasta')
input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r')

with redirected_stdio(stderr=os.devnull):
obs_table, obs_sequences = cluster_features_de_novo(
sequences=input_sequences, table=self.input_table,
perc_identity=0.99, strand='plus')
# order of identifiers is important for biom.Table equality
obs_table = \
obs_table.sort_order(self.input_table.ids(axis='observation'),
axis='observation')
self.assertEqual(obs_table, self.input_table)

input_sequences_list = _read_seqs(input_sequences)
# sequences are reverse-sorted by abundance in output
obs_seqs = _read_seqs(obs_sequences)
exp_seqs = [input_sequences_list[0], input_sequences_list[3],
input_sequences_list[2], input_sequences_list[1]]
self.assertEqual(obs_seqs, exp_seqs)

def test_99_percent_clustering_strand(self):
# Feature3 of dna-sequences-4 if the reverse comp of Feature3 of
# dna-sequences-1 and should only cluster if strand is set to 'both'
input_sequences_fp = self.get_data_path('dna-sequences-4.fasta')
input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r')

exp_table = biom.Table(np.array([[104, 106, 109],
[1, 1, 2],
[7, 8, 9]]),
['feature1', 'feature2',
'feature4'],
['sample1', 'sample2', 'sample3'])
with redirected_stdio(stderr=os.devnull):
obs_table, obs_sequences = cluster_features_de_novo(
sequences=input_sequences, table=self.input_table,
perc_identity=0.99, strand='both')

# order of identifiers is important for biom.Table equality
obs_table = \
obs_table.sort_order(exp_table.ids(axis='observation'),
axis='observation')

self.assertEqual(obs_table, exp_table)

input_sequences_list = _read_seqs(input_sequences)
# sequences are reverse-sorted by abundance in output
obs_seqs = _read_seqs(obs_sequences)
exp_seqs = [input_sequences_list[0], input_sequences_list[3],
input_sequences_list[1]]
self.assertEqual(obs_seqs, exp_seqs)

def test_short_sequences(self):
input_sequences_fp = self.get_data_path('dna-sequences-short.fasta')
input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r')
Expand Down Expand Up @@ -763,9 +817,9 @@ def test_skip_denovo(self):
self.assertEqual(obs_ref_seqs, exp_ref_seqs)

def test_skip_closed_reference(self):
# feature1 and feature3 clusters into r1 and feature2 and feature4
# clusters into r2 during closed-ref clustering; no unclustered
# features so de-novo clustering is skipped.
# self.ref_sequences_2 are rev comps of self.ref_sequences_1. As
# strand='both' is not passed, there will not be any clustering during
# closed-reference clustering

exp_table = biom.Table(np.array([[100, 101, 103],
[1, 1, 2],
Expand Down

0 comments on commit 693d1c2

Please sign in to comment.