assembly.assemble_spades: added option to drop contigs shorter than g…

…iven length. (#889) assembly.assemble_spades: added option to drop contigs shorter than given length (for #874)
broadinstitute · Oct 22, 2018 · fbf6003 · fbf6003
1 parent c7f6c12
commit fbf6003
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 4 deletions.
diff --git a/assembly.py b/assembly.py
@@ -352,6 +352,7 @@ def assemble_spades(
     spades_opts='',
     contigs_trusted=None, contigs_untrusted=None,
     filter_contigs=False,
+    min_contig_len=0,
     kmer_sizes=(55,65),
     n_reads=10000000,
     outReads=None,
@@ -377,6 +378,7 @@ def assemble_spades(
             tools.spades.SpadesTool().assemble(reads_fwd=reads_fwd, reads_bwd=reads_bwd, reads_unpaired=reads_unpaired,
                                                contigs_untrusted=contigs_untrusted, contigs_trusted=contigs_trusted,
                                                contigs_out=out_fasta, filter_contigs=filter_contigs,
+                                               min_contig_len=min_contig_len,
                                                kmer_sizes=kmer_sizes, mask_errors=mask_errors, max_kmer_sizes=max_kmer_sizes,
                                                spades_opts=spades_opts, mem_limit_gb=mem_limit_gb,
                                                threads=threads)
@@ -401,6 +403,8 @@ def parser_assemble_spades(parser=argparse.ArgumentParser()):
     parser.add_argument('--outReads', default=None, help='Save the trimmomatic/prinseq/subsamp reads to a BAM file')
     parser.add_argument('--filterContigs', dest='filter_contigs', default=False, action='store_true', 
                         help='only output contigs SPAdes is sure of (drop lesser-quality contigs from output)')
+    parser.add_argument('--minContigLen', dest='min_contig_len', type=int, default=0,
+                        help='only output contigs longer than this many bp')
     parser.add_argument('--spadesOpts', dest='spades_opts', default='', help='(advanced) Extra flags to pass to the SPAdes assembler')
     parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4, type=int, help='Max memory to use, in GB (default: %(default)s)')
     util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))

diff --git a/pipes/WDL/workflows/tasks/tasks_assembly.wdl b/pipes/WDL/workflows/tasks/tasks_assembly.wdl
@@ -4,6 +4,7 @@ task assemble {
 
     Int?    trinity_n_reads=250000
     Int?    spades_n_reads=10000000
+    Int?    spades_min_contig_len=0
 
     String? assembler="trinity"  # trinity, spades, or trinity-spades
 
@@ -35,6 +36,7 @@ task assemble {
             ${trim_clip_db} \
             ${sample_name}.assembly1-spades.fasta \
             ${'--nReads=' + spades_n_reads} \
+            ${'--minContigLen=' + spades_min_contig_len} \
             --memLimitGb $mem_in_gb \
             --outReads=${sample_name}.subsamp.bam \
             --loglevel=DEBUG
@@ -54,6 +56,7 @@ task assemble {
             ${sample_name}.assembly1-spades.fasta \
             --contigsUntrusted=${sample_name}.assembly1-trinity.fasta \
             ${'--nReads=' + spades_n_reads} \
+            ${'--minContigLen=' + spades_min_contig_len} \
             --memLimitGb $mem_in_gb \
             --loglevel=DEBUG
 

diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py
@@ -154,12 +154,12 @@ def test_assembly(self):
         inBam = os.path.join(inDir, '..', 'G5012.3.subset.bam')
         clipDb = os.path.join(inDir, 'clipDb.fasta')
         with util.file.tempfname('.fasta') as outFasta:
-            assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, out_fasta=outFasta)
+            assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, min_contig_len=180, out_fasta=outFasta)
             self.assertGreater(os.path.getsize(outFasta), 0)
             contig_lens = list(sorted(len(seq.seq) for seq in Bio.SeqIO.parse(outFasta, 'fasta')))
             #import sys
             #print('test_assembly_contigs_lens:', contig_lens, file=sys.stderr)
-            self.assertEqual(contig_lens, [168, 170, 177, 180, 184, 187, 190, 191, 194, 197, 211, 243, 244, 247, 288, 328, 348, 430])
+            self.assertEqual(contig_lens, [180, 184, 187, 190, 191, 194, 197, 211, 243, 244, 247, 288, 328, 348, 430])
 
     def test_assembly_with_previously_assembled_contigs(self):
         inDir = util.file.get_test_input_path(self)

diff --git a/tools/spades.py b/tools/spades.py
@@ -11,6 +11,8 @@
 import shlex
 import tempfile
 
+import Bio.SeqIO
+
 import tools
 import tools.samtools
 import tools.picard
@@ -41,7 +43,7 @@ def execute(self, args):    # pylint: disable=W0221
 
     def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None,
                  contigs_untrusted=None, kmer_sizes=(55,65), mask_errors=False, max_kmer_sizes=1, 
-                 filter_contigs=False, mem_limit_gb=8, threads=None, spades_opts=''):
+                 filter_contigs=False, min_contig_len=0, mem_limit_gb=8, threads=None, spades_opts=''):
         '''Assemble contigs from RNA-seq reads and (optionally) pre-existing contigs.
 
         Inputs:
@@ -57,6 +59,7 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
             mask_errors: if True, if spades fails with an error for a kmer size, pretend it just produced no contigs for that kmer size
             max_kmer_sizes: if this many kmer sizes succeed, do not try further ones
             filter_contigs: if True, outputs only "long and reliable transcripts with rather high expression" (per SPAdes docs)
+            min_contig_len: drop contigs shorter than this many bp
             mem_limit_gb: max memory to use, in gigabytes
             threads: number of threads to use
             spades_opts: additional options to pass to spades
@@ -72,6 +75,7 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
         threads = util.misc.sanitize_thread_count(threads)
 
         util.file.make_empty(contigs_out)
+        contigs_cumul_count = 0
 
         if ((reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0) or
             (reads_unpaired and os.path.getsize(reads_unpaired) > 0)):
@@ -103,8 +107,26 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
                         else:
                             raise
 
-                    util.file.concat(transcripts_fname, contigs_out, append=True)
+                    if min_contig_len:
+                        transcripts = Bio.SeqIO.parse(transcripts_fname, 'fasta')
+                        transcripts_sans_short = [r for r in transcripts if len(r.seq) >= min_contig_len]
+                        transcripts_fname = os.path.join(spades_dir, 'transcripts_over_{}bp.fasta'.format(min_contig_len))
+                        Bio.SeqIO.write(transcripts_sans_short, transcripts_fname, 'fasta')
+
+                    contigs_cumul = os.path.join(spades_dir, 'contigs_cumul.{}.fasta'.format(contigs_cumul_count))
+                    contigs_cumul_count += 1
+
+                    util.file.concat(inputFilePaths=(contigs_out, transcripts_fname), outputFilePath=contigs_cumul, append=True)
+                    shutil.copyfile(contigs_cumul, contigs_out)
+
                     if os.path.getsize(transcripts_fname):
                         kmer_sizes_succeeded += 1
                         if kmer_sizes_succeeded >= max_kmer_sizes:
                             break
+                # end: with util.file.tmp_dir('_spades') as spades_dir
+            # end: for kmer_size in util.misc.make_seq(kmer_sizes)
+        # if input non-empty
+    # end: def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None, ...)
+# end: class SpadesTool(tools.Tool)
+
+