Skip to content

Commit

Permalink
assembly.assemble_spades: added option to drop contigs shorter than g…
Browse files Browse the repository at this point in the history
…iven length. (#889)

assembly.assemble_spades: added option to drop contigs shorter than given length (for #874)
  • Loading branch information
notestaff authored Oct 22, 2018
1 parent c7f6c12 commit fbf6003
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 4 deletions.
4 changes: 4 additions & 0 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ def assemble_spades(
spades_opts='',
contigs_trusted=None, contigs_untrusted=None,
filter_contigs=False,
min_contig_len=0,
kmer_sizes=(55,65),
n_reads=10000000,
outReads=None,
Expand All @@ -377,6 +378,7 @@ def assemble_spades(
tools.spades.SpadesTool().assemble(reads_fwd=reads_fwd, reads_bwd=reads_bwd, reads_unpaired=reads_unpaired,
contigs_untrusted=contigs_untrusted, contigs_trusted=contigs_trusted,
contigs_out=out_fasta, filter_contigs=filter_contigs,
min_contig_len=min_contig_len,
kmer_sizes=kmer_sizes, mask_errors=mask_errors, max_kmer_sizes=max_kmer_sizes,
spades_opts=spades_opts, mem_limit_gb=mem_limit_gb,
threads=threads)
Expand All @@ -401,6 +403,8 @@ def parser_assemble_spades(parser=argparse.ArgumentParser()):
parser.add_argument('--outReads', default=None, help='Save the trimmomatic/prinseq/subsamp reads to a BAM file')
parser.add_argument('--filterContigs', dest='filter_contigs', default=False, action='store_true',
help='only output contigs SPAdes is sure of (drop lesser-quality contigs from output)')
parser.add_argument('--minContigLen', dest='min_contig_len', type=int, default=0,
help='only output contigs longer than this many bp')
parser.add_argument('--spadesOpts', dest='spades_opts', default='', help='(advanced) Extra flags to pass to the SPAdes assembler')
parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4, type=int, help='Max memory to use, in GB (default: %(default)s)')
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
Expand Down
3 changes: 3 additions & 0 deletions pipes/WDL/workflows/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ task assemble {

Int? trinity_n_reads=250000
Int? spades_n_reads=10000000
Int? spades_min_contig_len=0

String? assembler="trinity" # trinity, spades, or trinity-spades
Expand Down Expand Up @@ -35,6 +36,7 @@ task assemble {
${trim_clip_db} \
${sample_name}.assembly1-spades.fasta \
${'--nReads=' + spades_n_reads} \
${'--minContigLen=' + spades_min_contig_len} \
--memLimitGb $mem_in_gb \
--outReads=${sample_name}.subsamp.bam \
--loglevel=DEBUG
Expand All @@ -54,6 +56,7 @@ task assemble {
${sample_name}.assembly1-spades.fasta \
--contigsUntrusted=${sample_name}.assembly1-trinity.fasta \
${'--nReads=' + spades_n_reads} \
${'--minContigLen=' + spades_min_contig_len} \
--memLimitGb $mem_in_gb \
--loglevel=DEBUG

Expand Down
4 changes: 2 additions & 2 deletions test/unit/test_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,12 @@ def test_assembly(self):
inBam = os.path.join(inDir, '..', 'G5012.3.subset.bam')
clipDb = os.path.join(inDir, 'clipDb.fasta')
with util.file.tempfname('.fasta') as outFasta:
assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, out_fasta=outFasta)
assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, min_contig_len=180, out_fasta=outFasta)
self.assertGreater(os.path.getsize(outFasta), 0)
contig_lens = list(sorted(len(seq.seq) for seq in Bio.SeqIO.parse(outFasta, 'fasta')))
#import sys
#print('test_assembly_contigs_lens:', contig_lens, file=sys.stderr)
self.assertEqual(contig_lens, [168, 170, 177, 180, 184, 187, 190, 191, 194, 197, 211, 243, 244, 247, 288, 328, 348, 430])
self.assertEqual(contig_lens, [180, 184, 187, 190, 191, 194, 197, 211, 243, 244, 247, 288, 328, 348, 430])

def test_assembly_with_previously_assembled_contigs(self):
inDir = util.file.get_test_input_path(self)
Expand Down
26 changes: 24 additions & 2 deletions tools/spades.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import shlex
import tempfile

import Bio.SeqIO

import tools
import tools.samtools
import tools.picard
Expand Down Expand Up @@ -41,7 +43,7 @@ def execute(self, args): # pylint: disable=W0221

def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None,
contigs_untrusted=None, kmer_sizes=(55,65), mask_errors=False, max_kmer_sizes=1,
filter_contigs=False, mem_limit_gb=8, threads=None, spades_opts=''):
filter_contigs=False, min_contig_len=0, mem_limit_gb=8, threads=None, spades_opts=''):
'''Assemble contigs from RNA-seq reads and (optionally) pre-existing contigs.
Inputs:
Expand All @@ -57,6 +59,7 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
mask_errors: if True, if spades fails with an error for a kmer size, pretend it just produced no contigs for that kmer size
max_kmer_sizes: if this many kmer sizes succeed, do not try further ones
filter_contigs: if True, outputs only "long and reliable transcripts with rather high expression" (per SPAdes docs)
min_contig_len: drop contigs shorter than this many bp
mem_limit_gb: max memory to use, in gigabytes
threads: number of threads to use
spades_opts: additional options to pass to spades
Expand All @@ -72,6 +75,7 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
threads = util.misc.sanitize_thread_count(threads)

util.file.make_empty(contigs_out)
contigs_cumul_count = 0

if ((reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0) or
(reads_unpaired and os.path.getsize(reads_unpaired) > 0)):
Expand Down Expand Up @@ -103,8 +107,26 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
else:
raise

util.file.concat(transcripts_fname, contigs_out, append=True)
if min_contig_len:
transcripts = Bio.SeqIO.parse(transcripts_fname, 'fasta')
transcripts_sans_short = [r for r in transcripts if len(r.seq) >= min_contig_len]
transcripts_fname = os.path.join(spades_dir, 'transcripts_over_{}bp.fasta'.format(min_contig_len))
Bio.SeqIO.write(transcripts_sans_short, transcripts_fname, 'fasta')

contigs_cumul = os.path.join(spades_dir, 'contigs_cumul.{}.fasta'.format(contigs_cumul_count))
contigs_cumul_count += 1

util.file.concat(inputFilePaths=(contigs_out, transcripts_fname), outputFilePath=contigs_cumul, append=True)
shutil.copyfile(contigs_cumul, contigs_out)

if os.path.getsize(transcripts_fname):
kmer_sizes_succeeded += 1
if kmer_sizes_succeeded >= max_kmer_sizes:
break
# end: with util.file.tmp_dir('_spades') as spades_dir
# end: for kmer_size in util.misc.make_seq(kmer_sizes)
# if input non-empty
# end: def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None, ...)
# end: class SpadesTool(tools.Tool)


0 comments on commit fbf6003

Please sign in to comment.