Merge pull request #86 from jaleezyy/composite_ref
Composite reference host removal
fmaguire authored Jun 25, 2020
2 parents f7c6f65 + 96f1c75 commit bdaee56
Showing 5 changed files with 114 additions and 58 deletions.
64 changes: 21 additions & 43 deletions Snakefile
@@ -182,57 +182,46 @@ rule run_raw_fastqc:
         """

 ########################## Human Host Removal ################################
-rule raw_reads_human_reference_bwa_map:
+rule raw_reads_composite_reference_bwa_map:
     threads: 2
     conda:
         'conda_envs/snp_mapping.yaml'
     output:
-        non_map_bam = '{sn}/host_removal/{sn}_human_non_mapping_reads.bam',
-        mapped_bam = '{sn}/host_removal/{sn}_human_mapping_reads.bam'
+        '{sn}/host_removal/{sn}_viral_and_nonmapping_reads.bam',
     input:
         raw_r1 = '{sn}/combined_raw_fastq/{sn}_R1.fastq.gz',
         raw_r2 = '{sn}/combined_raw_fastq/{sn}_R2.fastq.gz'
     benchmark:
-        "{sn}/benchmarks/{sn}_human_reference_bwa_map.benchmark.tsv"
+        "{sn}/benchmarks/{sn}_composite_reference_bwa_map.benchmark.tsv"
     log:
         '{sn}/host_removal/{sn}_human_read_mapping.log'
     params:
-        human_index = os.path.join(exec_dir, config['human_reference'])
+        composite_index = os.path.join(exec_dir, config['composite_reference']),
+        script_path = os.path.join(exec_dir, "scripts", "filter_non_human_reads.py"),
+        viral_contig_name = config['viral_reference_contig_name']
     shell:
-        '(bwa mem -t {threads} {params.human_index} '
+        '(bwa mem -t {threads} {params.composite_index} '
         '{input.raw_r1} {input.raw_r2} | '
-        'samtools view -bS -q 30 -U {output.non_map_bam} -o {output.mapped_bam}) 2> {log} '
-        # filter all reads <30 MAPQ
+        '{params.script_path} -c {params.viral_contig_name} > {output}) 2> {log}'

 rule get_host_removed_reads:
     threads: 2
     conda: 'conda_envs/snp_mapping.yaml'
     output:
-        r1 = '{sn}/host_removal/{sn}_R1.fastq',
-        r2 = '{sn}/host_removal/{sn}_R2.fastq',
-        bam = '{sn}/host_removal/{sn}_human_mapping_reads_filtered_sorted.bam'
+        r1 = '{sn}/host_removal/{sn}_R1.fastq.gz',
+        r2 = '{sn}/host_removal/{sn}_R2.fastq.gz',
+        s = '{sn}/host_removal/{sn}_singletons.fastq.gz',
+        bam = '{sn}/host_removal/{sn}_viral_and_nonmapping_reads_filtered_sorted.bam'
     input:
-        '{sn}/host_removal/{sn}_human_non_mapping_reads.bam'
+        '{sn}/host_removal/{sn}_viral_and_nonmapping_reads.bam',
     benchmark:
         "{sn}/benchmarks/{sn}_get_host_removed_reads.benchmark.tsv"
     log:
-        '{sn}/host_removal/{sn}_bamtofastq.log'
+        '{sn}/host_removal/{sn}_samtools_fastq.log'
     shell:
         """
         samtools view -b {input} | samtools sort -n -@{threads} > {output.bam} 2> {log}
-        bedtools bamtofastq -i {output.bam} -fq {output.r1} -fq2 {output.r2} 2>> {log}
-        """
-
-rule gzip_host_removed_reads:
-    output:
-        '{sn}/host_removal/{sn}_R1.fastq.gz',
-        '{sn}/host_removal/{sn}_R2.fastq.gz',
-    input:
-        '{sn}/host_removal/{sn}_R1.fastq',
-        '{sn}/host_removal/{sn}_R2.fastq',
-    shell:
-        """
-        gzip {input}
+        samtools fastq -1 {output.r1} -2 {output.r2} -s {output.s} {output.bam} 2>> {log}
         """

 ###### Based on github.com/connor-lab/ncov2019-artic-nf/blob/master/modules/illumina.nf#L124 ######
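
For a hypothetical sample named sampleA, the rewritten host-removal shell directive expands to roughly the following pipeline (reference and contig values taken from example_config.yaml; the sample name is illustrative, not from the diff):

    bwa mem -t 2 data/composite_human_viral_reference.fna \
        sampleA/combined_raw_fastq/sampleA_R1.fastq.gz \
        sampleA/combined_raw_fastq/sampleA_R2.fastq.gz \
        | scripts/filter_non_human_reads.py -c MN908947.3 \
        > sampleA/host_removal/sampleA_viral_and_nonmapping_reads.bam

Rather than discarding everything below MAPQ 30, reads now compete between the human and viral sequences during alignment, and the filter script keeps primary alignments to the viral contig together with unmapped reads.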
@@ -351,33 +340,22 @@ rule get_mapping_reads:
     priority: 2
     conda: 'conda_envs/snp_mapping.yaml'
     output:
-        r1 = '{sn}/mapped_clean_reads/{sn}_R1.fastq',
-        r2 = '{sn}/mapped_clean_reads/{sn}_R2.fastq',
+        r1 = '{sn}/mapped_clean_reads/{sn}_R1.fastq.gz',
+        r2 = '{sn}/mapped_clean_reads/{sn}_R2.fastq.gz',
+        s = '{sn}/mapped_clean_reads/{sn}_singletons.fastq.gz',
         bam = '{sn}/mapped_clean_reads/{sn}_sorted_clean.bam'
     input:
         "{sn}/core/{sn}_viral_reference.mapping.primertrimmed.bam",
     benchmark:
         "{sn}/benchmarks/{sn}_get_mapping_reads.benchmark.tsv"
     log:
-        '{sn}/mapped_clean_reads/{sn}_bamtofastq.log'
+        '{sn}/mapped_clean_reads/{sn}_samtools_fastq.log'
     shell:
         """
         samtools sort -n {input} -o {output.bam} 2> {log}
-        bedtools bamtofastq -i {output.bam} -fq {output.r1} -fq2 {output.r2} 2>> {log}
+        samtools fastq -1 {output.r1} -2 {output.r2} -s {output.s} {output.bam} 2>> {log}
         """

-rule clean_reads_gzip:
-    priority: 2
-    output:
-        '{sn}/mapped_clean_reads/{sn}_R{r}.fastq.gz'
-    input:
-        '{sn}/mapped_clean_reads/{sn}_R{r}.fastq'
-    benchmark:
-        "{sn}/benchmarks/{sn}_clean_reads_gzip_{r}.benchmark.tsv"
-    shell:
-        'gzip {input}'
-

 rule run_ivar_consensus:
     conda:
         'conda_envs/ivar.yaml'
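
The switch from bedtools bamtofastq to samtools fastq also removes the need for the separate gzip rules: samtools fastq compresses its outputs when the file names end in .gz, and reads whose mate was dropped go to the -s singletons file, an output bedtools bamtofastq (-fq/-fq2 only) does not offer. A minimal standalone sketch of the same step, assuming an arbitrary input.bam:

    samtools sort -n input.bam -o name_sorted.bam
    samtools fastq -1 reads_R1.fastq.gz -2 reads_R2.fastq.gz \
        -s singletons.fastq.gz name_sorted.bam

The name sort (-n) matters: samtools fastq expects mate pairs to sit adjacent in its input.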
Expand Down Expand Up @@ -447,7 +425,7 @@ rule run_breseq:
labelled_output_dir = '{sn}/breseq/{sn}_output'
shell:
"""
breseq --reference {params.ref} --num-processors {threads} --polymorphism-prediction --brief-html-output --output {params.outdir} {input} >{log} 2>&1
breseq --reference {params.ref} --num-processors {threads} --polymorphism-prediction --brief-html-output --output {params.outdir} {input} > {log} 2>&1
mv -T {params.unlabelled_output_dir} {params.labelled_output_dir}
"""

5 changes: 3 additions & 2 deletions conda_envs/snp_mapping.yaml
@@ -4,7 +4,8 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - bwa=0.7.17=hed695b0_7
+  - bwa=0.7.17
   - samtools=1.7=2
   - bedtools=2.26.0=0
-  - breseq=0.35.0=h8b12597_0
+  - breseq=0.35.0
+  - pysam
13 changes: 8 additions & 5 deletions example_config.yaml
@@ -1,6 +1,9 @@
 # This file contains a high-level summary of pipeline configuration and inputs.
 # It is ingested by the Snakefile, and also intended to be human-readable.

+# Sample table (can be created using example_sample_table.csv)
+samples: "sample_table.csv"
+
 # Folder to place all results (can be relative or absolute path)
 result_dir: "results_dir"

@@ -10,15 +13,17 @@ min_qual: 20
 # Minimum read length to retain after trimming, used in 'trim_galore' and 'ivar trim'
 min_len: 20

-# path from snakemake dir to .bed file defining amplicon primer scheme
+# Path from snakemake dir to .bed file defining amplicon primer scheme
 scheme_bed: 'resources/primer_schemes/nCoV-2019_v3_fixed.bed'

-# path from snakemake dir to bwa indexed human reference genome
-human_reference: 'data/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna'
+# Path from snakemake dir to bwa indexed human + viral reference genome
+composite_reference: 'data/composite_human_viral_reference.fna'

 # Used as bwa reference genome when removing host sequences.
 # Also used as 'ivar' reference genome in variant detection + consensus.
 # Used as -r,-g arguments to 'quast'
+# Contig name needed for the host-removal filtering script
+viral_reference_contig_name: 'MN908947.3'
 viral_reference_genome: 'data/MN908947.3.fasta'
 viral_reference_feature_coords: 'data/MN908947.3.gff3'

@@ -39,7 +44,5 @@ ivar_min_freq_threshold: 0.25
 # iVar minimum mapQ to call variant (ivar variants: -q)
 ivar_min_variant_quality: 20

-samples: "sample_table.csv"
-
 # fasta of sequences to include with phylo
 phylo_include_seqs: "data/gisaid_hcov-19_2020_05_25_23.fasta"
67 changes: 67 additions & 0 deletions scripts/filter_non_human_reads.py
@@ -0,0 +1,67 @@
#!/usr/bin/env python
import pysam
import sys
import argparse

def filter_reads(contig_name, input_sam_fp, output_bam_fp):

    # use stdin/stdout when no file paths are given
    if input_sam_fp:
        input_sam = pysam.AlignmentFile(input_sam_fp, 'r')
    else:
        input_sam = pysam.AlignmentFile('-', 'r')

    if output_bam_fp:
        output_bam = pysam.AlignmentFile(output_bam_fp, 'wb',
                                         template=input_sam)
    else:
        output_bam = pysam.AlignmentFile('-', 'wb', template=input_sam)

    # keep reads that are unmapped or mapped to the viral reference contig
    viral_reads = 0
    human_reads = 0
    unmapped_reads = 0

    # iterate over input from BWA
    for read in input_sam:
        # only look at primary alignments
        if not read.is_supplementary and not read.is_secondary:
            if read.reference_name == contig_name:
                output_bam.write(read)
                viral_reads += 1
            elif read.is_unmapped:
                output_bam.write(read)
                unmapped_reads += 1
            else:
                human_reads += 1

    total_reads = viral_reads + human_reads + unmapped_reads

    print(f"viral read count = {viral_reads} "
          f"({viral_reads/total_reads * 100:.2f}%)", file=sys.stderr)
    print(f"human read count = {human_reads} "
          f"({human_reads/total_reads * 100:.2f}%)", file=sys.stderr)
    print(f"unmapped read count = {unmapped_reads} "
          f"({unmapped_reads/total_reads * 100:.2f}%)", file=sys.stderr)

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description="Get all reads that are "
                                                 "unmapped or map to a "
                                                 "specific reference "
                                                 "contig")
    parser.add_argument('-i', '--input', required=False, default=False,
                        help="Input SAM formatted file (stdin used if "
                             "not specified)")

    parser.add_argument('-o', '--output', required=False, default=False,
                        help="Output BAM formatted file (stdout used if not "
                             "specified)")

    parser.add_argument('-c', '--contig_name', required=False,
                        default="MN908947.3",
                        help="Contig name to retain e.g. viral")

    args = parser.parse_args()

    filter_reads(args.contig_name, args.input, args.output)
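
The script can also be exercised outside the workflow. A sketch of standalone usage (file names here are illustrative): SAM is read from stdin or -i, filtered BAM goes to stdout or -o, and the per-category counts print to stderr:

    bwa mem -t 2 composite_human_viral_reference.fna R1.fastq.gz R2.fastq.gz \
        | ./filter_non_human_reads.py -c MN908947.3 -o viral_and_nonmapping_reads.bam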
23 changes: 15 additions & 8 deletions scripts/get_data_dependencies.sh
@@ -1,7 +1,8 @@
-#!/bin/bash
+#!/usr/bin/env bash

 set -e # exit if pipeline returns non-zero status
 set -o pipefail # return value of last command to exit with non-zero status
+source ~/.bashrc

 database_dir=0
 accession="MN908947.3"
@@ -38,17 +39,23 @@ curl -s "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&
 curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=fasta&retmode=txt" > $database_dir/$accession.fasta

 # install and activate env for lmat/kraken to build their databases
-conda create -n data_dependencies -c conda-forge -c bioconda -y kraken2=2.0.8
-CONDA_BASE=$(conda info --base)
+#$CONDA_EXE create -n data_dependencies -c conda-forge -c bioconda -y kraken2=2.0.8 bwa
+CONDA_BASE=$($CONDA_EXE info --base)
 source $CONDA_BASE/etc/profile.d/conda.sh
 conda activate data_dependencies

 # get kraken2, and clean db after building
 kraken2-build --download-taxonomy --db $database_dir/Kraken2/db --threads 10 --use-ftp
 kraken2-build --download-library viral --db $database_dir/Kraken2/db --threads 10 --use-ftp
 kraken2-build --build --threads 10 --db $database_dir/Kraken2/db
 kraken2-build --clean --threads 10 --db $database_dir/Kraken2/db

-# get the GRCh38 BWA index human genome
-curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.bwa_index.tar.gz" > $database_dir/GRCh38_bwa.tar.gz
-tar xvf $database_dir/GRCh38_bwa.tar.gz -C $database_dir
+# get the GRCh38 human genome
+# as per https://lh3.github.io/2017/11/13/which-human-reference-genome-to-use
+curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" > $database_dir/GRC38_no_alt_analysis_set.fna.gz
+gunzip $database_dir/GRC38_no_alt_analysis_set.fna.gz
+
+# create composite reference of human and virus for competitive
+# bwa-mapping-based host removal
+cat $database_dir/GRC38_no_alt_analysis_set.fna $database_dir/$accession.fasta > $database_dir/composite_human_viral_reference.fna
+bwa index $database_dir/composite_human_viral_reference.fna

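Assuming the downloads succeed, a quick sanity check is that bwa index leaves its five index files next to the composite FASTA, which the composite_reference entry in example_config.yaml should then point at:

    ls $database_dir/composite_human_viral_reference.fna.{amb,ann,bwt,pac,sa}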