Merge pull request #86 from jaleezyy/composite_ref
Composite reference host removal
fmaguire authored Jun 25, 2020
2 parents f7c6f65 + 96f1c75 commit bdaee56
Showing 5 changed files with 114 additions and 58 deletions.
64 changes: 21 additions & 43 deletions Snakefile
@@ -182,57 +182,46 @@ rule run_raw_fastqc:
         """

 ########################## Human Host Removal ################################
-rule raw_reads_human_reference_bwa_map:
+rule raw_reads_composite_reference_bwa_map:
     threads: 2
     conda:
         'conda_envs/snp_mapping.yaml'
     output:
-        non_map_bam = '{sn}/host_removal/{sn}_human_non_mapping_reads.bam',
-        mapped_bam = '{sn}/host_removal/{sn}_human_mapping_reads.bam'
+        '{sn}/host_removal/{sn}_viral_and_nonmapping_reads.bam',
     input:
         raw_r1 = '{sn}/combined_raw_fastq/{sn}_R1.fastq.gz',
         raw_r2 = '{sn}/combined_raw_fastq/{sn}_R2.fastq.gz'
     benchmark:
-        "{sn}/benchmarks/{sn}_human_reference_bwa_map.benchmark.tsv"
+        "{sn}/benchmarks/{sn}_composite_reference_bwa_map.benchmark.tsv"
     log:
         '{sn}/host_removal/{sn}_human_read_mapping.log'
     params:
-        human_index = os.path.join(exec_dir, config['human_reference'])
+        composite_index = os.path.join(exec_dir, config['composite_reference']),
+        script_path = os.path.join(exec_dir, "scripts", "filter_non_human_reads.py"),
+        viral_contig_name = config['viral_reference_contig_name']
     shell:
-        '(bwa mem -t {threads} {params.human_index} '
+        '(bwa mem -t {threads} {params.composite_index} '
         '{input.raw_r1} {input.raw_r2} | '
-        'samtools view -bS -q 30 -U {output.non_map_bam} -o {output.mapped_bam}) 2> {log} '
-        # filter all reads <30 MAPQ
+        '{params.script_path} -c {params.viral_contig_name} > {output}) 2> {log}'

 rule get_host_removed_reads:
     threads: 2
     conda: 'conda_envs/snp_mapping.yaml'
     output:
-        r1 = '{sn}/host_removal/{sn}_R1.fastq',
-        r2 = '{sn}/host_removal/{sn}_R2.fastq',
-        bam = '{sn}/host_removal/{sn}_human_mapping_reads_filtered_sorted.bam'
+        r1 = '{sn}/host_removal/{sn}_R1.fastq.gz',
+        r2 = '{sn}/host_removal/{sn}_R2.fastq.gz',
+        s = '{sn}/host_removal/{sn}_singletons.fastq.gz',
+        bam = '{sn}/host_removal/{sn}_viral_and_nonmapping_reads_filtered_sorted.bam'
     input:
-        '{sn}/host_removal/{sn}_human_non_mapping_reads.bam'
+        '{sn}/host_removal/{sn}_viral_and_nonmapping_reads.bam',
     benchmark:
         "{sn}/benchmarks/{sn}_get_host_removed_reads.benchmark.tsv"
     log:
-        '{sn}/host_removal/{sn}_bamtofastq.log'
+        '{sn}/host_removal/{sn}_samtools_fastq.log'
     shell:
         """
         samtools view -b {input} | samtools sort -n -@{threads} > {output.bam} 2> {log}
-        bedtools bamtofastq -i {output.bam} -fq {output.r1} -fq2 {output.r2} 2>> {log}
-        """
-
-rule gzip_host_removed_reads:
-    output:
-        '{sn}/host_removal/{sn}_R1.fastq.gz',
-        '{sn}/host_removal/{sn}_R2.fastq.gz',
-    input:
-        '{sn}/host_removal/{sn}_R1.fastq',
-        '{sn}/host_removal/{sn}_R2.fastq',
-    shell:
-        """
-        gzip {input}
+        samtools fastq -1 {output.r1} -2 {output.r2} -s {output.s} {output.bam} 2>> {log}
         """

 ###### Based on github.com/connor-lab/ncov2019-artic-nf/blob/master/modules/illumina.nf#L124 ######
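
For a hypothetical sample named sampleA, the rewritten host-removal shell directive expands to roughly the following pipeline (reference and contig values taken from example_config.yaml; the sample name is illustrative, not from the diff):

    bwa mem -t 2 data/composite_human_viral_reference.fna \
        sampleA/combined_raw_fastq/sampleA_R1.fastq.gz \
        sampleA/combined_raw_fastq/sampleA_R2.fastq.gz \
        | scripts/filter_non_human_reads.py -c MN908947.3 \
        > sampleA/host_removal/sampleA_viral_and_nonmapping_reads.bam

Rather than discarding everything below MAPQ 30, reads now compete between the human and viral sequences during alignment, and the filter script keeps primary alignments to the viral contig together with unmapped reads.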
@@ -351,33 +340,22 @@ rule get_mapping_reads:
     priority: 2
     conda: 'conda_envs/snp_mapping.yaml'
     output:
-        r1 = '{sn}/mapped_clean_reads/{sn}_R1.fastq',
-        r2 = '{sn}/mapped_clean_reads/{sn}_R2.fastq',
+        r1 = '{sn}/mapped_clean_reads/{sn}_R1.fastq.gz',
+        r2 = '{sn}/mapped_clean_reads/{sn}_R2.fastq.gz',
+        s = '{sn}/mapped_clean_reads/{sn}_singletons.fastq.gz',
         bam = '{sn}/mapped_clean_reads/{sn}_sorted_clean.bam'
     input:
         "{sn}/core/{sn}_viral_reference.mapping.primertrimmed.bam",
     benchmark:
         "{sn}/benchmarks/{sn}_get_mapping_reads.benchmark.tsv"
     log:
-        '{sn}/mapped_clean_reads/{sn}_bamtofastq.log'
+        '{sn}/mapped_clean_reads/{sn}_samtools_fastq.log'
     shell:
         """
         samtools sort -n {input} -o {output.bam} 2> {log}
-        bedtools bamtofastq -i {output.bam} -fq {output.r1} -fq2 {output.r2} 2>> {log}
+        samtools fastq -1 {output.r1} -2 {output.r2} -s {output.s} {output.bam} 2>> {log}
         """

-rule clean_reads_gzip:
-    priority: 2
-    output:
-        '{sn}/mapped_clean_reads/{sn}_R{r}.fastq.gz'
-    input:
-        '{sn}/mapped_clean_reads/{sn}_R{r}.fastq'
-    benchmark:
-        "{sn}/benchmarks/{sn}_clean_reads_gzip_{r}.benchmark.tsv"
-    shell:
-        'gzip {input}'
-

 rule run_ivar_consensus:
     conda:
         'conda_envs/ivar.yaml'
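
The switch from bedtools bamtofastq to samtools fastq also removes the need for the separate gzip rules: samtools fastq compresses its outputs when the file names end in .gz, and reads whose mate was dropped go to the -s singletons file, an output bedtools bamtofastq (-fq/-fq2 only) does not offer. A minimal standalone sketch of the same step, assuming an arbitrary input.bam:

    samtools sort -n input.bam -o name_sorted.bam
    samtools fastq -1 reads_R1.fastq.gz -2 reads_R2.fastq.gz \
        -s singletons.fastq.gz name_sorted.bam

The name sort (-n) matters: samtools fastq expects mate pairs to sit adjacent in its input.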
Expand Down Expand Up @@ -447,7 +425,7 @@ rule run_breseq:
labelled_output_dir = '{sn}/breseq/{sn}_output'
shell:
"""
breseq --reference {params.ref} --num-processors {threads} --polymorphism-prediction --brief-html-output --output {params.outdir} {input} >{log} 2>&1
breseq --reference {params.ref} --num-processors {threads} --polymorphism-prediction --brief-html-output --output {params.outdir} {input} > {log} 2>&1
mv -T {params.unlabelled_output_dir} {params.labelled_output_dir}
"""

5 changes: 3 additions & 2 deletions conda_envs/snp_mapping.yaml
@@ -4,7 +4,8 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - bwa=0.7.17=hed695b0_7
+  - bwa=0.7.17
   - samtools=1.7=2
   - bedtools=2.26.0=0
-  - breseq=0.35.0=h8b12597_0
+  - breseq=0.35.0
+  - pysam
13 changes: 8 additions & 5 deletions example_config.yaml
@@ -1,6 +1,9 @@
 # This file contains a high-level summary of pipeline configuration and inputs.
 # It is ingested by the Snakefile, and also intended to be human-readable.

+# Sample table (can be created using example_sample_table.csv)
+samples: "sample_table.csv"
+
 # Folder to place all results (can be relative or absolute path)
 result_dir: "results_dir"

@@ -10,15 +13,17 @@ min_qual: 20
 # Minimum read length to retain after trimming, used in 'trim_galore' and 'ivar trim'
 min_len: 20

-# path from snakemake dir to .bed file defining amplicon primer scheme
+# Path from snakemake dir to .bed file defining amplicon primer scheme
 scheme_bed: 'resources/primer_schemes/nCoV-2019_v3_fixed.bed'

-# path from snakemake dir to bwa indexed human reference genome
-human_reference: 'data/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna'
+# Path from snakemake dir to bwa indexed human + viral reference genome
+composite_reference: 'data/composite_human_viral_reference.fna'

 # Used as bwa reference genome when removing host sequences.
 # Also used as 'ivar' reference genome in variant detection + consensus.
 # Used as -r,-g arguments to 'quast'
+# Contig name needed for the host-removal filtering script
+viral_reference_contig_name: 'MN908947.3'
 viral_reference_genome: 'data/MN908947.3.fasta'
 viral_reference_feature_coords: 'data/MN908947.3.gff3'

@@ -39,7 +44,5 @@ ivar_min_freq_threshold: 0.25
 # iVar minimum mapQ to call variant (ivar variants: -q)
 ivar_min_variant_quality: 20

-samples: "sample_table.csv"
-
 # fasta of sequences to include with phylo
 phylo_include_seqs: "data/gisaid_hcov-19_2020_05_25_23.fasta"
67 changes: 67 additions & 0 deletions scripts/filter_non_human_reads.py
@@ -0,0 +1,67 @@
#!/usr/bin/env python
import pysam
import sys
import argparse

def filter_reads(contig_name, input_sam_fp, output_bam_fp):

    # use stdin/stdout when no file paths are given
    if input_sam_fp:
        input_sam = pysam.AlignmentFile(input_sam_fp, 'r')
    else:
        input_sam = pysam.AlignmentFile('-', 'r')

    if output_bam_fp:
        output_bam = pysam.AlignmentFile(output_bam_fp, 'wb',
                                         template=input_sam)
    else:
        output_bam = pysam.AlignmentFile('-', 'wb', template=input_sam)

    # keep reads that are unmapped or mapped to the viral reference contig
    viral_reads = 0
    human_reads = 0
    unmapped_reads = 0

    # iterate over input from BWA
    for read in input_sam:
        # only look at primary alignments
        if not read.is_supplementary and not read.is_secondary:
            if read.reference_name == contig_name:
                output_bam.write(read)
                viral_reads += 1
            elif read.is_unmapped:
                output_bam.write(read)
                unmapped_reads += 1
            else:
                human_reads += 1

    total_reads = viral_reads + human_reads + unmapped_reads

    print(f"viral read count = {viral_reads} "
          f"({viral_reads/total_reads * 100:.2f}%)", file=sys.stderr)
    print(f"human read count = {human_reads} "
          f"({human_reads/total_reads * 100:.2f}%)", file=sys.stderr)
    print(f"unmapped read count = {unmapped_reads} "
          f"({unmapped_reads/total_reads * 100:.2f}%)", file=sys.stderr)

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description="Get all reads that are "
                                                 "unmapped or map to a "
                                                 "specific reference "
                                                 "contig")
    parser.add_argument('-i', '--input', required=False, default=False,
                        help="Input SAM formatted file (stdin used if "
                             "not specified)")

    parser.add_argument('-o', '--output', required=False, default=False,
                        help="Output BAM formatted file (stdout used if not "
                             "specified)")

    parser.add_argument('-c', '--contig_name', required=False,
                        default="MN908947.3",
                        help="Contig name to retain e.g. viral")

    args = parser.parse_args()

    filter_reads(args.contig_name, args.input, args.output)
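
The script can also be exercised outside the workflow. A sketch of standalone usage (file names here are illustrative): SAM is read from stdin or -i, filtered BAM goes to stdout or -o, and the per-category counts print to stderr:

    bwa mem -t 2 composite_human_viral_reference.fna R1.fastq.gz R2.fastq.gz \
        | ./filter_non_human_reads.py -c MN908947.3 -o viral_and_nonmapping_reads.bam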
23 changes: 15 additions & 8 deletions scripts/get_data_dependencies.sh
@@ -1,7 +1,8 @@
-#!/bin/bash
+#!/usr/bin/env bash

 set -e # exit if pipeline returns non-zero status
 set -o pipefail # return value of last command to exit with non-zero status
+source ~/.bashrc

 database_dir=0
 accession="MN908947.3"
@@ -38,17 +39,23 @@ curl -s "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&
 curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=fasta&retmode=txt" > $database_dir/$accession.fasta

 # install and activate env for lmat/kraken to build their databases
-conda create -n data_dependencies -c conda-forge -c bioconda -y kraken2=2.0.8
-CONDA_BASE=$(conda info --base)
+#$CONDA_EXE create -n data_dependencies -c conda-forge -c bioconda -y kraken2=2.0.8 bwa
+CONDA_BASE=$($CONDA_EXE info --base)
 source $CONDA_BASE/etc/profile.d/conda.sh
 conda activate data_dependencies

 # get kraken2, and clean db after building
 kraken2-build --download-taxonomy --db $database_dir/Kraken2/db --threads 10 --use-ftp
 kraken2-build --download-library viral --db $database_dir/Kraken2/db --threads 10 --use-ftp
 kraken2-build --build --threads 10 --db $database_dir/Kraken2/db
 kraken2-build --clean --threads 10 --db $database_dir/Kraken2/db

-# get the GRCh38 BWA index human genome
-curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.bwa_index.tar.gz" > $database_dir/GRCh38_bwa.tar.gz
-tar xvf $database_dir/GRCh38_bwa.tar.gz -C $database_dir
+# get the GRCh38 human genome
+# as per https://lh3.github.io/2017/11/13/which-human-reference-genome-to-use
+curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" > $database_dir/GRC38_no_alt_analysis_set.fna.gz
+gunzip $database_dir/GRC38_no_alt_analysis_set.fna.gz
+
+# create composite reference of human and virus for competitive
+# bwa-mapping-based host removal
+cat $database_dir/GRC38_no_alt_analysis_set.fna $database_dir/$accession.fasta > $database_dir/composite_human_viral_reference.fna
+bwa index $database_dir/composite_human_viral_reference.fna

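Assuming the downloads succeed, a quick sanity check is that bwa index leaves its five index files next to the composite FASTA, which the composite_reference entry in example_config.yaml should then point at:

    ls $database_dir/composite_human_viral_reference.fna.{amb,ann,bwt,pac,sa}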