From 693d1c261dba5c84c68f9526dca9710bda840605 Mon Sep 17 00:00:00 2001 From: Valentin Ambroise <113367796+vaamb@users.noreply.github.com> Date: Fri, 13 Oct 2023 21:26:45 +0200 Subject: [PATCH] NEW: Exposes `strand` parameter for `cluster_features_de_novo` (#90) --- q2_vsearch/_cluster_features.py | 7 ++- q2_vsearch/plugin_setup.py | 3 ++ q2_vsearch/tests/data/dna-sequences-4.fasta | 8 +++ q2_vsearch/tests/test_cluster_features.py | 60 +++++++++++++++++++-- 4 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 q2_vsearch/tests/data/dna-sequences-4.fasta diff --git a/q2_vsearch/_cluster_features.py b/q2_vsearch/_cluster_features.py index fa31f65..ee57f2c 100644 --- a/q2_vsearch/_cluster_features.py +++ b/q2_vsearch/_cluster_features.py @@ -182,7 +182,9 @@ def _fasta_with_sizes(input_fasta_fp, output_fasta_fp, table): def cluster_features_de_novo(sequences: DNAFASTAFormat, table: biom.Table, - perc_identity: float, threads: int = 1 + perc_identity: float, + strand: str = 'plus', + threads: int = 1 ) -> (biom.Table, DNAFASTAFormat): clustered_sequences = DNAFASTAFormat() with tempfile.NamedTemporaryFile() as fasta_with_sizes: @@ -193,6 +195,7 @@ def cluster_features_de_novo(sequences: DNAFASTAFormat, table: biom.Table, '--id', str(perc_identity), '--centroids', str(clustered_sequences), '--uc', out_uc.name, + '--strand', str(strand), '--qmask', 'none', # ensures no lowercase DNA chars '--xsize', '--threads', str(threads), @@ -347,7 +350,7 @@ def cluster_features_open_reference(ctx, sequences, table, reference_sequences, de_novo_table, de_novo_seqs = cluster_features_de_novo( sequences=unmatched_seqs, table=unmatched_table, - perc_identity=perc_identity, threads=threads) + perc_identity=perc_identity, strand=strand, threads=threads) if skipped_closed_ref: merged_reference_seqs, = merge_seqs(data=[reference_sequences, diff --git a/q2_vsearch/plugin_setup.py b/q2_vsearch/plugin_setup.py index c8e1b85..f2539ec 100644 --- a/q2_vsearch/plugin_setup.py +++ b/q2_vsearch/plugin_setup.py @@ -53,6 +53,7 @@ parameters={ 'perc_identity': qiime2.plugin.Float % qiime2.plugin.Range( 0, 1, inclusive_start=False, inclusive_end=True), + 'strand': qiime2.plugin.Str % qiime2.plugin.Choices(['plus', 'both']), 'threads': qiime2.plugin.Int % qiime2.plugin.Range( 0, 256, inclusive_start=True, inclusive_end=True) }, @@ -68,6 +69,8 @@ 'perc_identity': ('The percent identity at which clustering should be ' 'performed. This parameter maps to vsearch\'s --id ' 'parameter.'), + 'strand': ('Search plus (i.e., forward) or both (i.e., forward and ' + 'reverse complement) strands.'), 'threads': ('The number of threads to use for computation. Passing 0 ' 'will launch one thread per CPU core.') }, diff --git a/q2_vsearch/tests/data/dna-sequences-4.fasta b/q2_vsearch/tests/data/dna-sequences-4.fasta new file mode 100644 index 0000000..8ec9b37 --- /dev/null +++ b/q2_vsearch/tests/data/dna-sequences-4.fasta @@ -0,0 +1,8 @@ +>feature1 +AAACGTTACGGTTAACTATACATGCAGAAGACTAATCGGCATGGTGCGACCGGTGTCCAAGTAACGCCACCCCCCCTAGTGAAATAGGAAAGTAAATAAC +>feature2 +ACGTACGTACGTACGTACGTACGTACGTACGTGCATGGTGCGACCGGTGTCCAAGTAACGCCACCCCCCCTAGTGAAATAGGAAAGTAAATAACACGTAC +>feature3 +GTTATTTACTTTCCTATTTCACTAGGGGGGGTGGCGTTACTTGGACACCGGTCGCACCATGCCGATTAGTCTTCTGCATGTATAGTTAACCGTAACGTTC +>feature4 +GGGCGTTACGGTTAACTATACATGCAGAAGACTAATCGGCATGGTGCGACCGGTGTCCAAGTAACGCCACCCCCCCTAGTGAAATAGGAAAGTAAATAAC diff --git a/q2_vsearch/tests/test_cluster_features.py b/q2_vsearch/tests/test_cluster_features.py index 6c96806..a617e65 100644 --- a/q2_vsearch/tests/test_cluster_features.py +++ b/q2_vsearch/tests/test_cluster_features.py @@ -198,6 +198,60 @@ def test_1_percent_clustering(self): exp_seqs = [self.input_sequences_list[0]] self.assertEqual(obs_seqs, exp_seqs) + def test_99_percent_clustering_strand_no_clustering(self): + # Feature3 of dna-sequences-4 if the reverse comp of Feature3 of + # dna-sequences-1 and should only cluster if strand is set to 'both' + input_sequences_fp = self.get_data_path('dna-sequences-4.fasta') + input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r') + + with redirected_stdio(stderr=os.devnull): + obs_table, obs_sequences = cluster_features_de_novo( + sequences=input_sequences, table=self.input_table, + perc_identity=0.99, strand='plus') + # order of identifiers is important for biom.Table equality + obs_table = \ + obs_table.sort_order(self.input_table.ids(axis='observation'), + axis='observation') + self.assertEqual(obs_table, self.input_table) + + input_sequences_list = _read_seqs(input_sequences) + # sequences are reverse-sorted by abundance in output + obs_seqs = _read_seqs(obs_sequences) + exp_seqs = [input_sequences_list[0], input_sequences_list[3], + input_sequences_list[2], input_sequences_list[1]] + self.assertEqual(obs_seqs, exp_seqs) + + def test_99_percent_clustering_strand(self): + # Feature3 of dna-sequences-4 if the reverse comp of Feature3 of + # dna-sequences-1 and should only cluster if strand is set to 'both' + input_sequences_fp = self.get_data_path('dna-sequences-4.fasta') + input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r') + + exp_table = biom.Table(np.array([[104, 106, 109], + [1, 1, 2], + [7, 8, 9]]), + ['feature1', 'feature2', + 'feature4'], + ['sample1', 'sample2', 'sample3']) + with redirected_stdio(stderr=os.devnull): + obs_table, obs_sequences = cluster_features_de_novo( + sequences=input_sequences, table=self.input_table, + perc_identity=0.99, strand='both') + + # order of identifiers is important for biom.Table equality + obs_table = \ + obs_table.sort_order(exp_table.ids(axis='observation'), + axis='observation') + + self.assertEqual(obs_table, exp_table) + + input_sequences_list = _read_seqs(input_sequences) + # sequences are reverse-sorted by abundance in output + obs_seqs = _read_seqs(obs_sequences) + exp_seqs = [input_sequences_list[0], input_sequences_list[3], + input_sequences_list[1]] + self.assertEqual(obs_seqs, exp_seqs) + def test_short_sequences(self): input_sequences_fp = self.get_data_path('dna-sequences-short.fasta') input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r') @@ -763,9 +817,9 @@ def test_skip_denovo(self): self.assertEqual(obs_ref_seqs, exp_ref_seqs) def test_skip_closed_reference(self): - # feature1 and feature3 clusters into r1 and feature2 and feature4 - # clusters into r2 during closed-ref clustering; no unclustered - # features so de-novo clustering is skipped. + # self.ref_sequences_2 are rev comps of self.ref_sequences_1. As + # strand='both' is not passed, there will not be any clustering during + # closed-reference clustering exp_table = biom.Table(np.array([[100, 101, 103], [1, 1, 2],