From fbf60039690080bb43b665da24fbec3118160aa9 Mon Sep 17 00:00:00 2001
From: Ilya Shlyakhter
Date: Mon, 22 Oct 2018 17:00:07 -0400
Subject: [PATCH] assembly.assemble_spades: added option to drop contigs
 shorter than given length. (#889)

assembly.assemble_spades: added option to drop contigs shorter than given
length (for #874)
---
 assembly.py                                  |  4 ++++
 pipes/WDL/workflows/tasks/tasks_assembly.wdl |  3 +++
 test/unit/test_assembly.py                   |  4 ++--
 tools/spades.py                              | 26 ++++++++++++++++++--
 4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/assembly.py b/assembly.py
index 2f98af2ce..ad03bffb1 100755
--- a/assembly.py
+++ b/assembly.py
@@ -352,6 +352,7 @@ def assemble_spades(
     spades_opts='',
     contigs_trusted=None, contigs_untrusted=None,
     filter_contigs=False,
+    min_contig_len=0,
     kmer_sizes=(55,65),
     n_reads=10000000,
     outReads=None,
@@ -377,6 +378,7 @@ def assemble_spades(
     tools.spades.SpadesTool().assemble(reads_fwd=reads_fwd, reads_bwd=reads_bwd, reads_unpaired=reads_unpaired,
                                        contigs_untrusted=contigs_untrusted, contigs_trusted=contigs_trusted,
                                        contigs_out=out_fasta, filter_contigs=filter_contigs,
+                                       min_contig_len=min_contig_len,
                                        kmer_sizes=kmer_sizes, mask_errors=mask_errors, max_kmer_sizes=max_kmer_sizes,
                                        spades_opts=spades_opts, mem_limit_gb=mem_limit_gb,
                                        threads=threads)
@@ -401,6 +403,8 @@ def parser_assemble_spades(parser=argparse.ArgumentParser()):
     parser.add_argument('--outReads', default=None, help='Save the trimmomatic/prinseq/subsamp reads to a BAM file')
     parser.add_argument('--filterContigs', dest='filter_contigs', default=False, action='store_true',
                         help='only output contigs SPAdes is sure of (drop lesser-quality contigs from output)')
+    parser.add_argument('--minContigLen', dest='min_contig_len', type=int, default=0,
+                        help='only output contigs longer than this many bp')
     parser.add_argument('--spadesOpts', dest='spades_opts', default='', help='(advanced) Extra flags to pass to the SPAdes assembler')
     parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4, type=int, help='Max memory to use, in GB (default: %(default)s)')
     util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
diff --git a/pipes/WDL/workflows/tasks/tasks_assembly.wdl b/pipes/WDL/workflows/tasks/tasks_assembly.wdl
index f83227217..17080a21f 100644
--- a/pipes/WDL/workflows/tasks/tasks_assembly.wdl
+++ b/pipes/WDL/workflows/tasks/tasks_assembly.wdl
@@ -4,6 +4,7 @@ task assemble {
 
   Int? trinity_n_reads=250000
   Int? spades_n_reads=10000000
+  Int? spades_min_contig_len=0
 
   String? assembler="trinity"  # trinity, spades, or trinity-spades
 
@@ -35,6 +36,7 @@ task assemble {
       ${trim_clip_db} \
       ${sample_name}.assembly1-spades.fasta \
       ${'--nReads=' + spades_n_reads} \
+      ${'--minContigLen=' + spades_min_contig_len} \
       --memLimitGb $mem_in_gb \
       --outReads=${sample_name}.subsamp.bam \
       --loglevel=DEBUG
@@ -54,6 +56,7 @@ task assemble {
       ${sample_name}.assembly1-spades.fasta \
       --contigsUntrusted=${sample_name}.assembly1-trinity.fasta \
       ${'--nReads=' + spades_n_reads} \
+      ${'--minContigLen=' + spades_min_contig_len} \
       --memLimitGb $mem_in_gb \
       --loglevel=DEBUG
 
diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py
index 857cc4d4b..5fddb1e38 100644
--- a/test/unit/test_assembly.py
+++ b/test/unit/test_assembly.py
@@ -154,12 +154,12 @@ def test_assembly(self):
         inBam = os.path.join(inDir, '..', 'G5012.3.subset.bam')
         clipDb = os.path.join(inDir, 'clipDb.fasta')
         with util.file.tempfname('.fasta') as outFasta:
-            assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, out_fasta=outFasta)
+            assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, min_contig_len=180, out_fasta=outFasta)
             self.assertGreater(os.path.getsize(outFasta), 0)
             contig_lens = list(sorted(len(seq.seq) for seq in Bio.SeqIO.parse(outFasta, 'fasta')))
             #import sys
             #print('test_assembly_contigs_lens:', contig_lens, file=sys.stderr)
-            self.assertEqual(contig_lens, [168, 170, 177, 180, 184, 187, 190, 191, 194, 197, 211, 243, 244, 247, 288, 328, 348, 430])
+            self.assertEqual(contig_lens, [180, 184, 187, 190, 191, 194, 197, 211, 243, 244, 247, 288, 328, 348, 430])
 
     def test_assembly_with_previously_assembled_contigs(self):
         inDir = util.file.get_test_input_path(self)
diff --git a/tools/spades.py b/tools/spades.py
index 93726414c..f7981e527 100644
--- a/tools/spades.py
+++ b/tools/spades.py
@@ -11,6 +11,8 @@
 import shlex
 import tempfile
 
+import Bio.SeqIO
+
 import tools
 import tools.samtools
 import tools.picard
@@ -41,7 +43,7 @@ def execute(self, args): # pylint: disable=W0221
 
     def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None,
                  contigs_untrusted=None, kmer_sizes=(55,65), mask_errors=False, max_kmer_sizes=1,
-                 filter_contigs=False, mem_limit_gb=8, threads=None, spades_opts=''):
+                 filter_contigs=False, min_contig_len=0, mem_limit_gb=8, threads=None, spades_opts=''):
         '''Assemble contigs from RNA-seq reads and (optionally) pre-existing contigs.
 
         Inputs:
@@ -57,6 +59,7 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
             mask_errors: if True, if spades fails with an error for a kmer size, pretend it just produced no contigs for that kmer size
             max_kmer_sizes: if this many kmer sizes succeed, do not try further ones
             filter_contigs: if True, outputs only "long and reliable transcripts with rather high expression" (per SPAdes docs)
+            min_contig_len: drop contigs shorter than this many bp
            mem_limit_gb: max memory to use, in gigabytes
             threads: number of threads to use
             spades_opts: additional options to pass to spades
@@ -72,6 +75,7 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
         threads = util.misc.sanitize_thread_count(threads)
 
         util.file.make_empty(contigs_out)
+        contigs_cumul_count = 0
 
         if ((reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0) or
             (reads_unpaired and os.path.getsize(reads_unpaired) > 0)):
@@ -103,8 +107,26 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
                         else:
                             raise
 
-                    util.file.concat(transcripts_fname, contigs_out, append=True)
+                    if min_contig_len:
+                        transcripts = Bio.SeqIO.parse(transcripts_fname, 'fasta')
+                        transcripts_sans_short = [r for r in transcripts if len(r.seq) >= min_contig_len]
+                        transcripts_fname = os.path.join(spades_dir, 'transcripts_over_{}bp.fasta'.format(min_contig_len))
+                        Bio.SeqIO.write(transcripts_sans_short, transcripts_fname, 'fasta')
+
+                    contigs_cumul = os.path.join(spades_dir, 'contigs_cumul.{}.fasta'.format(contigs_cumul_count))
+                    contigs_cumul_count += 1
+
+                    util.file.concat(inputFilePaths=(contigs_out, transcripts_fname), outputFilePath=contigs_cumul, append=True)
+                    shutil.copyfile(contigs_cumul, contigs_out)
+
                     if os.path.getsize(transcripts_fname):
                         kmer_sizes_succeeded += 1
                     if kmer_sizes_succeeded >= max_kmer_sizes:
                         break
+                # end: with util.file.tmp_dir('_spades') as spades_dir
+            # end: for kmer_size in util.misc.make_seq(kmer_sizes)
+        # if input non-empty
+    # end: def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None, ...)
+# end: class SpadesTool(tools.Tool)
+
+
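
Note on the core of the change: for each kmer size, the per-kmer transcripts FASTA is now run through a Biopython length filter before being concatenated into contigs_out. Below is a minimal standalone sketch of that filtering step, assuming only that Biopython is installed; the script name and FASTA paths are hypothetical, and it uses plain Bio.SeqIO rather than the viral-ngs util.file helpers used in the patch.

    # filter_fasta_by_len.py -- illustrative sketch, not part of the patch.
    # Mirrors the "len(r.seq) >= min_contig_len" check added in tools/spades.py.
    import Bio.SeqIO

    def drop_short_contigs(in_fasta, out_fasta, min_contig_len):
        """Copy records from in_fasta to out_fasta, keeping only those at least min_contig_len bp long."""
        records = Bio.SeqIO.parse(in_fasta, 'fasta')
        kept = (r for r in records if len(r.seq) >= min_contig_len)
        return Bio.SeqIO.write(kept, out_fasta, 'fasta')  # returns the number of records written

    if __name__ == '__main__':
        # Hypothetical file names; 180 bp matches the cutoff used in the updated unit test.
        n_kept = drop_short_contigs('transcripts.fasta', 'transcripts_over_180bp.fasta', 180)
        print('kept {} contigs'.format(n_kept))

After the patch, the same cutoff is exposed as --minContigLen on the assemble_spades command line and as the optional spades_min_contig_len input of the WDL assemble task; the default of 0 disables the filter, preserving the previous behavior of keeping every contig.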