From 693d1c261dba5c84c68f9526dca9710bda840605 Mon Sep 17 00:00:00 2001
From: Valentin Ambroise <113367796+vaamb@users.noreply.github.com>
Date: Fri, 13 Oct 2023 21:26:45 +0200
Subject: [PATCH] NEW: Exposes `strand` parameter for
 `cluster_features_de_novo` (#90)

---
 q2_vsearch/_cluster_features.py             |  7 ++-
 q2_vsearch/plugin_setup.py                  |  3 ++
 q2_vsearch/tests/data/dna-sequences-4.fasta |  8 +++
 q2_vsearch/tests/test_cluster_features.py   | 60 +++++++++++++++++++--
 4 files changed, 73 insertions(+), 5 deletions(-)
 create mode 100644 q2_vsearch/tests/data/dna-sequences-4.fasta

diff --git a/q2_vsearch/_cluster_features.py b/q2_vsearch/_cluster_features.py
index fa31f65..ee57f2c 100644
--- a/q2_vsearch/_cluster_features.py
+++ b/q2_vsearch/_cluster_features.py
@@ -182,7 +182,9 @@ def _fasta_with_sizes(input_fasta_fp, output_fasta_fp, table):
 
 
 def cluster_features_de_novo(sequences: DNAFASTAFormat, table: biom.Table,
-                             perc_identity: float, threads: int = 1
+                             perc_identity: float,
+                             strand: str = 'plus',
+                             threads: int = 1
                              ) -> (biom.Table, DNAFASTAFormat):
     clustered_sequences = DNAFASTAFormat()
     with tempfile.NamedTemporaryFile() as fasta_with_sizes:
@@ -193,6 +195,7 @@ def cluster_features_de_novo(sequences: DNAFASTAFormat, table: biom.Table,
                    '--id', str(perc_identity),
                    '--centroids', str(clustered_sequences),
                    '--uc', out_uc.name,
+                   '--strand', str(strand),
                    '--qmask', 'none',  # ensures no lowercase DNA chars
                    '--xsize',
                    '--threads', str(threads),
@@ -347,7 +350,7 @@ def cluster_features_open_reference(ctx, sequences, table, reference_sequences,
 
         de_novo_table, de_novo_seqs = cluster_features_de_novo(
             sequences=unmatched_seqs, table=unmatched_table,
-            perc_identity=perc_identity, threads=threads)
+            perc_identity=perc_identity, strand=strand, threads=threads)
 
         if skipped_closed_ref:
             merged_reference_seqs, = merge_seqs(data=[reference_sequences,
diff --git a/q2_vsearch/plugin_setup.py b/q2_vsearch/plugin_setup.py
index c8e1b85..f2539ec 100644
--- a/q2_vsearch/plugin_setup.py
+++ b/q2_vsearch/plugin_setup.py
@@ -53,6 +53,7 @@
     parameters={
         'perc_identity': qiime2.plugin.Float % qiime2.plugin.Range(
                           0, 1, inclusive_start=False, inclusive_end=True),
+        'strand': qiime2.plugin.Str % qiime2.plugin.Choices(['plus', 'both']),
         'threads': qiime2.plugin.Int % qiime2.plugin.Range(
                           0, 256, inclusive_start=True, inclusive_end=True)
     },
@@ -68,6 +69,8 @@
         'perc_identity': ('The percent identity at which clustering should be '
                           'performed. This parameter maps to vsearch\'s --id '
                           'parameter.'),
+        'strand': ('Search plus (i.e., forward) or both (i.e., forward and '
+                   'reverse complement) strands.'),
         'threads': ('The number of threads to use for computation. Passing 0 '
                     'will launch one thread per CPU core.')
     },
diff --git a/q2_vsearch/tests/data/dna-sequences-4.fasta b/q2_vsearch/tests/data/dna-sequences-4.fasta
new file mode 100644
index 0000000..8ec9b37
--- /dev/null
+++ b/q2_vsearch/tests/data/dna-sequences-4.fasta
@@ -0,0 +1,8 @@
+>feature1
+AAACGTTACGGTTAACTATACATGCAGAAGACTAATCGGCATGGTGCGACCGGTGTCCAAGTAACGCCACCCCCCCTAGTGAAATAGGAAAGTAAATAAC
+>feature2
+ACGTACGTACGTACGTACGTACGTACGTACGTGCATGGTGCGACCGGTGTCCAAGTAACGCCACCCCCCCTAGTGAAATAGGAAAGTAAATAACACGTAC
+>feature3
+GTTATTTACTTTCCTATTTCACTAGGGGGGGTGGCGTTACTTGGACACCGGTCGCACCATGCCGATTAGTCTTCTGCATGTATAGTTAACCGTAACGTTC
+>feature4
+GGGCGTTACGGTTAACTATACATGCAGAAGACTAATCGGCATGGTGCGACCGGTGTCCAAGTAACGCCACCCCCCCTAGTGAAATAGGAAAGTAAATAAC
diff --git a/q2_vsearch/tests/test_cluster_features.py b/q2_vsearch/tests/test_cluster_features.py
index 6c96806..a617e65 100644
--- a/q2_vsearch/tests/test_cluster_features.py
+++ b/q2_vsearch/tests/test_cluster_features.py
@@ -198,6 +198,60 @@ def test_1_percent_clustering(self):
         exp_seqs = [self.input_sequences_list[0]]
         self.assertEqual(obs_seqs, exp_seqs)
 
+    def test_99_percent_clustering_strand_no_clustering(self):
+        # Feature3 of dna-sequences-4 if the reverse comp of Feature3 of
+        #  dna-sequences-1 and should only cluster if strand is set to 'both'
+        input_sequences_fp = self.get_data_path('dna-sequences-4.fasta')
+        input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r')
+
+        with redirected_stdio(stderr=os.devnull):
+            obs_table, obs_sequences = cluster_features_de_novo(
+                sequences=input_sequences, table=self.input_table,
+                perc_identity=0.99, strand='plus')
+        # order of identifiers is important for biom.Table equality
+        obs_table = \
+            obs_table.sort_order(self.input_table.ids(axis='observation'),
+                                 axis='observation')
+        self.assertEqual(obs_table, self.input_table)
+
+        input_sequences_list = _read_seqs(input_sequences)
+        # sequences are reverse-sorted by abundance in output
+        obs_seqs = _read_seqs(obs_sequences)
+        exp_seqs = [input_sequences_list[0], input_sequences_list[3],
+                    input_sequences_list[2], input_sequences_list[1]]
+        self.assertEqual(obs_seqs, exp_seqs)
+
+    def test_99_percent_clustering_strand(self):
+        # Feature3 of dna-sequences-4 if the reverse comp of Feature3 of
+        #  dna-sequences-1 and should only cluster if strand is set to 'both'
+        input_sequences_fp = self.get_data_path('dna-sequences-4.fasta')
+        input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r')
+
+        exp_table = biom.Table(np.array([[104, 106, 109],
+                                         [1, 1, 2],
+                                         [7, 8, 9]]),
+                               ['feature1', 'feature2',
+                                'feature4'],
+                               ['sample1', 'sample2', 'sample3'])
+        with redirected_stdio(stderr=os.devnull):
+            obs_table, obs_sequences = cluster_features_de_novo(
+                sequences=input_sequences, table=self.input_table,
+                perc_identity=0.99, strand='both')
+
+        # order of identifiers is important for biom.Table equality
+        obs_table = \
+            obs_table.sort_order(exp_table.ids(axis='observation'),
+                                 axis='observation')
+
+        self.assertEqual(obs_table, exp_table)
+
+        input_sequences_list = _read_seqs(input_sequences)
+        # sequences are reverse-sorted by abundance in output
+        obs_seqs = _read_seqs(obs_sequences)
+        exp_seqs = [input_sequences_list[0], input_sequences_list[3],
+                    input_sequences_list[1]]
+        self.assertEqual(obs_seqs, exp_seqs)
+
     def test_short_sequences(self):
         input_sequences_fp = self.get_data_path('dna-sequences-short.fasta')
         input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r')
@@ -763,9 +817,9 @@ def test_skip_denovo(self):
         self.assertEqual(obs_ref_seqs, exp_ref_seqs)
 
     def test_skip_closed_reference(self):
-        # feature1 and feature3 clusters into r1 and feature2 and feature4
-        # clusters into r2 during closed-ref clustering; no unclustered
-        # features so de-novo clustering is skipped.
+        # self.ref_sequences_2 are rev comps of self.ref_sequences_1. As
+        # strand='both' is not passed, there will not be any clustering during
+        # closed-reference clustering
 
         exp_table = biom.Table(np.array([[100, 101, 103],
                                          [1, 1, 2],