From 686daa7e4655bdcd38386f7143c60cbd7d3c1589 Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 28 Sep 2020 00:03:30 +0200 Subject: [PATCH 01/63] Fix KeyError on adding new units --- workflow/scripts/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index a44bc791..c3754dc2 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -110,6 +110,8 @@ def parse_samples(df, config, PREPROCESS): for a in assem_list: if sample not in assemblies[a].keys(): assemblies[a][sample] = {unit: {}} + if unit not in assemblies[a][sample].keys(): + assemblies[a][sample][unit] = {} if r2: assemblies[a][sample][unit]["R1"] = [R1_p] assemblies[a][sample][unit]["R2"] = [R2_p] From ccde58a9ac51946a0350009b0bae55ca8885fa2c Mon Sep 17 00:00:00 2001 From: John Sundh Date: Mon, 28 Sep 2020 00:20:36 +0200 Subject: [PATCH 02/63] Fix relative path to eggnog-parser.py --- workflow/rules/annotation.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk index 95879870..c51dd2f5 100644 --- a/workflow/rules/annotation.smk +++ b/workflow/rules/annotation.smk @@ -255,7 +255,7 @@ rule get_kegg_info: opj("resources", "kegg", "download.log") params: outdir=lambda w, output: os.path.dirname(output[0]), - src="../scripts/eggnog-parser.py" + src="workflow/scripts/eggnog-parser.py" shell: """ python {params.src} download {params.outdir} > {log} 2>&1 @@ -371,7 +371,7 @@ rule parse_ko_annotations: params: outbase=opj(config["paths"]["results"], "annotation", "{assembly}"), resource_dir=opj("resources", "kegg"), - src="../scripts/eggnog-parser.py" + src="workflow/scripts/eggnog-parser.py" shell: """ python {params.src} parse {params.resource_dir} {input.annotations} \ From 938bbe544fe85745418a0e4b4aa7f3613c687bdd Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 28 Sep 2020 08:30:29 +0200 Subject: [PATCH 03/63] Increase runtime for pfam_scan --- workflow/rules/annotation.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk index c51dd2f5..fd60c200 100644 --- a/workflow/rules/annotation.smk +++ b/workflow/rules/annotation.smk @@ -207,7 +207,7 @@ rule pfam_scan: tmp_out=opj(config["paths"]["temp"], "{assembly}.pfam.out") threads: 2 resources: - runtime=lambda wildcards, attempt: attempt**2*60*4 + runtime=lambda wildcards, attempt: attempt**2*60*10 shell: """ pfam_scan.pl -fasta {input[0]} -dir {params.dir} -cpu {threads} \ From c44492de59a7b0bfc189c1208c38d32704b4fc6b Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 28 Sep 2020 14:40:52 +0200 Subject: [PATCH 04/63] Remove utility function for markduplicates --- workflow/scripts/common.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index c3754dc2..a79b5709 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -664,21 +664,6 @@ def parse_cmout(f): ## Quantification functions -def markdup_mem(wildcards, cores): - """ - Calculates the memory to allocate when running MarkDuplicates - :param wildcards: - :param cores: number of cores for currently running workflow - :return: - """ - if cores is not None: - threads = min(cores, 10) - else: - threads = 1 - mem_gb_per_thread = 2 - return int(mem_gb_per_thread * threads) - - def get_fc_files(wildcards, file_type): g = wildcards.group files = [] From 0975afb4c5149494f35050ab00503b91830841c2 Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 28 Sep 2020 14:41:38 +0200 Subject: [PATCH 05/63] Don't remove symlinks in fastANI rule --- workflow/rules/binning.smk | 1 - 1 file changed, 1 deletion(-) diff --git a/workflow/rules/binning.smk b/workflow/rules/binning.smk index e7450e8f..232f1be1 100644 --- a/workflow/rules/binning.smk +++ b/workflow/rules/binning.smk @@ -778,7 +778,6 @@ rule fastANI: fastANI --rl {input[0]} --ql {input[1]} -k {params.k} -t {threads} \ --fragLen {params.frag_len} --minFraction {params.fraction} \ --matrix -o {output[0]} > {log} 2>&1 - rm {params.indir}/*.fa {params.indir}/*.fna """ rule cluster_genomes: From 3caa6b79bdfb3a33ce844f1751d2ceac32725a7e Mon Sep 17 00:00:00 2001 From: johnne Date: Wed, 30 Sep 2020 14:44:54 +0200 Subject: [PATCH 06/63] Add workaround for MarkDuplicates PG error picard-tools MarkDuplicates threw an error when parsing the '@PG' key in the sam header. This workaround creates a temporary bam file with the PG key removed before running MarkDuplicates --- workflow/rules/quantification.smk | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index 22c741d4..4f64dc5c 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -1,5 +1,3 @@ -from scripts.common import markdup_mem - localrules: quantify, write_featurefile, @@ -41,29 +39,37 @@ rule remove_mark_duplicates: log: opj(config["paths"]["results"], "assembly", "{assembly}", "mapping", "{sample}_{unit}_{seq_type}.markdup.log") params: + header=opj(config["paths"]["temp"], "{assembly}", "{sample}_{unit}_{seq_type}.header"), + rehead_bam=opj(config["paths"]["temp"], "{assembly}", "{sample}_{unit}_{seq_type}.rehead.bam"), temp_bam=opj(config["paths"]["temp"], "{assembly}", "{sample}_{unit}_{seq_type}.markdup.bam"), temp_sort_bam=opj(config["paths"]["temp"], "{assembly}", "{sample}_{unit}_{seq_type}.markdup.re_sort.bam"), temp_dir=opj(config["paths"]["temp"], "{assembly}") threads: 10 resources: - runtime=lambda wildcards, attempt: attempt**2*60*4, - mem_mb=lambda wildcards: markdup_mem(wildcards, workflow.cores) + runtime=lambda wildcards, attempt: attempt**2*60*4 conda: "../envs/quantify.yml" shell: """ mkdir -p {params.temp_dir} - java -Xms2g -Xmx{resources.mem_mb}g -XX:ParallelGCThreads={threads} \ + # Fix bam header + samtools view -H {input} | egrep -v "^@PG" > {params.header} + samtools reheader -P {params.header} {input} > {params.rehead_bam} + # Set memory max + mem="-Xmx$((6 * {threads}))g" + java -Xms2g $mem -XX:ParallelGCThreads={threads} \ -jar $CONDA_PREFIX/share/picard-*/picard.jar MarkDuplicates \ - I={input} M={output[2]} O={params.temp_bam} REMOVE_DUPLICATES=TRUE \ - USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE ASO=coordinate 2> {log} + I={params.rehead_bam} M={output[2]} O={params.temp_bam} REMOVE_DUPLICATES=TRUE \ + USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE ASSUME_SORT_ORDER=coordinate \ + PROGRAM_RECORD_ID=null ADD_PG_TAG_TO_READS=FALSE 2> {log} # Re sort the bam file using samtools - samtools sort -o {params.temp_sort_bam} {params.temp_bam} + samtools_threads="$(({threads} - 1))" + samtools sort -@ $samtools_threads -o {params.temp_sort_bam} {params.temp_bam} > /dev/null 2>&1 # Index the bam file samtools index {params.temp_sort_bam} mv {params.temp_sort_bam} {output[0]} mv {params.temp_sort_bam}.bai {output[1]} - rm {params.temp_bam} + rm {params.temp_bam} {params.rehead_bam} {params.header} """ ##### featurecounts ##### From 0ec5643838697e5b4498644a704aa60a59162fc5 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 6 Oct 2020 12:04:30 +0200 Subject: [PATCH 07/63] Fix path to eggnog-parser.py script --- workflow/rules/quantification.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index 4f64dc5c..5bea583f 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -182,7 +182,7 @@ rule quantify_features: opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.{fc_type}.tsv") shell: """ - python source/utils/eggnog-parser.py \ + python workflow/scripts/eggnog-parser.py \ quantify {input.abund} {input.annot} {output[0]} """ From 4ec3083096be9cfdbb65b2c7117506919a69ec0e Mon Sep 17 00:00:00 2001 From: johnne Date: Wed, 7 Oct 2020 21:49:59 +0200 Subject: [PATCH 08/63] Add norm_method wildcard constraints --- workflow/rules/common.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index bf2cd2fb..f8e18459 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -35,7 +35,8 @@ wildcard_constraints: seq_type="[sp]e", binner="[a-z]+", group="\w+", - l="\d+" + l="\d+", + norm_method="(TMM|CSS|REL)" from scripts.common import check_uppmax, check_annotation, check_assembly, check_classifiers From 7c68a479f072a535c10608c4bc92ddfd8c2f60bf Mon Sep 17 00:00:00 2001 From: johnne Date: Wed, 7 Oct 2020 21:50:23 +0200 Subject: [PATCH 09/63] Add conda env for normalization --- workflow/envs/normalize.yml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 workflow/envs/normalize.yml diff --git a/workflow/envs/normalize.yml b/workflow/envs/normalize.yml new file mode 100644 index 00000000..273cdfa9 --- /dev/null +++ b/workflow/envs/normalize.yml @@ -0,0 +1,8 @@ +name: normalize +channels: + - r + - bioconda + - defaults +dependencies: + - bioconductor-edger=3.30.0 + - bioconductor-metagenomeseq=1.30.0 From 905a8e23b3c81596422ff17a5c6488a4816c9b90 Mon Sep 17 00:00:00 2001 From: johnne Date: Wed, 7 Oct 2020 21:50:47 +0200 Subject: [PATCH 10/63] Add script for normalization of counts --- workflow/scripts/normalize.R | 42 ++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 workflow/scripts/normalize.R diff --git a/workflow/scripts/normalize.R b/workflow/scripts/normalize.R new file mode 100644 index 00000000..00f2624b --- /dev/null +++ b/workflow/scripts/normalize.R @@ -0,0 +1,42 @@ +#!/usr/bin/env Rscript + +log <- file(snakemake@log[[1]], open="wt") +sink(log) +sink(log, type="message") + +num_cols <- function(x) { + x <- x[, unlist(lapply(x, is.numeric))] + x +} + +str_cols <- function(x) { + x <- x[, unlist(lapply(x, is.character))] + x +} + +method <- snakemake@params$method + +# Read the counts +x <- read.delim(snakemake@input[[1]], row.names = 1) +# Remove unclassified +x <- x[row.names(x)!="Unclassified", ] +# Extract only numeric columns +x_num <- num_cols(x) + +if (method %in% c("TMM", "RLE")) { + library(edgeR, quietly = TRUE) + # Create DGE + obj <- DGEList(x_num) + # Calculate norm factors + obj <- calcNormFactors(obj, method = method) + # Calculate cpms + norm <- cpm(obj, normalized.lib.sizes = TRUE) +} else { + library(metagenomeSeq, quietly = TRUE) + obj <- newMRexperiment(x_num) + norm <- MRcounts(obj, norm = TRUE) +} + +norm <- cbind(str_cols(x), norm) + +write.table(x = norm, file = snakemake@output[[1]], quote = FALSE, sep="\t") From 3586286e6f4487d3d456472dd2aafd09504f5f06 Mon Sep 17 00:00:00 2001 From: johnne Date: Wed, 7 Oct 2020 21:52:15 +0200 Subject: [PATCH 11/63] Initial add of normalize rule --- workflow/rules/quantification.smk | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index 5bea583f..3ab4872a 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -186,6 +186,20 @@ rule quantify_features: quantify {input.abund} {input.annot} {output[0]} """ +rule normalize: + input: + opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.raw.tsv") + output: + opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.{norm_method}.tsv") + log: + opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.{norm_method}.log") + params: + method = "{norm_method}" + conda: + "../envs/normalize.yml" + script: + "../scripts/normalize.R" + rule sum_to_taxa: input: tax=opj(config["paths"]["results"], "annotation", "{assembly}", "taxonomy", From 644369aa2403e120d6c1cef242eb3b41a999ca44 Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 8 Oct 2020 13:00:33 +0200 Subject: [PATCH 12/63] Generalize featurecounts to one rule --- workflow/rules/quantification.smk | 45 ++++++++----------------------- 1 file changed, 11 insertions(+), 34 deletions(-) diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index 5bea583f..425cc515 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -74,56 +74,33 @@ rule remove_mark_duplicates: ##### featurecounts ##### -rule featurecount_pe: +rule featurecount: input: gff=opj(config["paths"]["results"], "annotation", "{assembly}", "final_contigs.features.gff"), bam=opj(config["paths"]["results"], "assembly", "{assembly}", - "mapping", "{sample}_{unit}_pe"+POSTPROCESS+".bam") + "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam") output: opj(config["paths"]["results"], "assembly", "{assembly}", "mapping", - "{sample}_{unit}_pe.fc.tsv"), + "{sample}_{unit}_{seq_type}.fc.tsv"), opj(config["paths"]["results"], "assembly", "{assembly}", "mapping", - "{sample}_{unit}_pe.fc.tsv.summary") + "{sample}_{unit}_{seq_type}.fc.tsv.summary") log: opj(config["paths"]["results"], "assembly", "{assembly}", - "mapping", "{sample}_{unit}_pe.fc.log") + "mapping", "{sample}_{unit}_{seq_type}.fc.log") threads: 4 - params: tmpdir=config["paths"]["temp"] - resources: - runtime=lambda wildcards, attempt: attempt**2*30 - conda: - "../envs/quantify.yml" - shell: - """ - featureCounts -a {input.gff} -o {output[0]} -t CDS -g gene_id -M -p \ - -B -T {threads} --tmpDir {params.tmpdir} {input.bam} > {log} 2>&1 - """ - -rule featurecount_se: - input: - gff=opj(config["paths"]["results"], "annotation", "{assembly}", - "final_contigs.features.gff"), - bam=opj(config["paths"]["results"], "assembly", "{assembly}", - "mapping", "{sample}_{unit}_se"+POSTPROCESS+".bam") - output: - opj(config["paths"]["results"], "assembly", "{assembly}", - "mapping", "{sample}_{unit}_se.fc.tsv"), - opj(config["paths"]["results"], "assembly", "{assembly}", - "mapping", "{sample}_{unit}_se.fc.tsv.summary") - log: - opj(config["paths"]["results"], "assembly", "{assembly}", - "mapping", "{sample}_{unit}_se.fc.log") - threads: 4 - params: tmpdir=config["paths"]["temp"] + params: + tmpdir=config["paths"]["temp"], + setting=lambda wildcards: "-B -p" if wildcards.seq_type == "pe" else "" resources: runtime=lambda wildcards, attempt: attempt**2*30 conda: "../envs/quantify.yml" shell: """ - featureCounts -a {input.gff} -o {output[0]} -t CDS -g gene_id \ - -M -T {threads} --tmpDir {params.tmpdir} {input.bam} > {log} 2>&1 + mkdir -p {params.tmpdir} + featureCounts -a {input.gff} -o {output[0]} -t CDS -g gene_id -M \ + {params.setting} -T {threads} --tmpDir {params.tmpdir} {input.bam} > {log} 2>&1 """ rule samtools_stats: From 98b1e1fc12a71da16b54c37f067d2768549b7e2c Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 8 Oct 2020 14:31:00 +0200 Subject: [PATCH 13/63] Add RPKM functionality to normalize script --- workflow/scripts/normalize.R | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/workflow/scripts/normalize.R b/workflow/scripts/normalize.R index 00f2624b..f81b636c 100644 --- a/workflow/scripts/normalize.R +++ b/workflow/scripts/normalize.R @@ -24,15 +24,25 @@ x <- x[row.names(x)!="Unclassified", ] x_num <- num_cols(x) if (method %in% c("TMM", "RLE")) { - library(edgeR, quietly = TRUE) + library(edgeR) # Create DGE obj <- DGEList(x_num) # Calculate norm factors obj <- calcNormFactors(obj, method = method) # Calculate cpms norm <- cpm(obj, normalized.lib.sizes = TRUE) +} else if (method == "RPKM") { + library(edgeR) + # Extract gene length column + gene_length <- x_num$Length + x_num <- x_num[, colnames(x_num) != "Length"] + obj <- DGEList(x_num) + # Calculate norm factors + obj <- calcNormFactors(obj, method = "TMM") + # Calculate RPKM + norm <- rpkm(obj, normalized.lib.sizes = TRUE, gene.length = gene_length) } else { - library(metagenomeSeq, quietly = TRUE) + library(metagenomeSeq) obj <- newMRexperiment(x_num) norm <- MRcounts(obj, norm = TRUE) } From 25d15f715a553aa36c825833b675c729278ac5d8 Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 8 Oct 2020 14:37:36 +0200 Subject: [PATCH 14/63] Add RPKM rule --- workflow/rules/quantification.smk | 73 ++++++++++++++++--------------- 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index 306e10d2..6485cd5b 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -2,7 +2,6 @@ localrules: quantify, write_featurefile, samtools_stats, - normalize_featurecount, aggregate_featurecount, sum_to_taxa, quantify_features, @@ -72,6 +71,20 @@ rule remove_mark_duplicates: rm {params.temp_bam} {params.rehead_bam} {params.header} """ +rule samtools_stats: + input: + opj(config["paths"]["results"], "assembly", "{assembly}", + "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam") + output: + opj(config["paths"]["results"], "assembly", "{assembly}", + "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam.stats") + conda: + "../envs/quantify.yml" + shell: + """ + samtools stats {input} > {output} + """ + ##### featurecounts ##### rule featurecount: @@ -103,54 +116,44 @@ rule featurecount: {params.setting} -T {threads} --tmpDir {params.tmpdir} {input.bam} > {log} 2>&1 """ -rule samtools_stats: - input: - opj(config["paths"]["results"], "assembly", "{assembly}", - "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam") - output: - opj(config["paths"]["results"], "assembly", "{assembly}", - "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam.stats") - conda: - "../envs/quantify.yml" - shell: - """ - samtools stats {input} > {output} - """ - -rule normalize_featurecount: +rule clean_featurecount: input: opj(config["paths"]["results"], "assembly", "{assembly}", "mapping", - "{sample}_{unit}_{seq_type}.fc.tsv"), - opj(config["paths"]["results"], "assembly", "{assembly}", "mapping", - "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam.stats") + "{sample}_{unit}_{seq_type}.fc.tsv") output: opj(config["paths"]["results"], "assembly", "{assembly}", "mapping", - "{sample}_{unit}_{seq_type}.fc.tpm.tsv"), - opj(config["paths"]["results"], "assembly", "{assembly}", "mapping", - "{sample}_{unit}_{seq_type}.fc.raw.tsv") - log: - opj(config["paths"]["results"], "assembly", "{assembly}", "mapping", - "{sample}_{unit}_{seq_type}.fc.norm.log") + "{sample}_{unit}_{seq_type}.fc.clean.tsv") script: "../scripts/quantification_utils.py" rule aggregate_featurecount: - """Aggregates raw and normalized featureCounts files""" + """Aggregates all cleaned count files from featureCounts""" input: - raw_files=get_all_files(samples, opj(config["paths"]["results"], + get_all_files(samples, opj(config["paths"]["results"], "assembly", "{assembly}", "mapping"), - ".fc.raw.tsv"), - tpm_files=get_all_files(samples, opj(config["paths"]["results"], - "assembly", "{assembly}", "mapping"), - ".fc.tpm.tsv"), - gff_file=opj(config["paths"]["results"], "annotation", "{assembly}", - "final_contigs.features.gff") + ".fc.clean.tsv") output: - raw=opj(config["paths"]["results"], "annotation", "{assembly}", "fc.raw.tsv"), - tpm=opj(config["paths"]["results"], "annotation", "{assembly}", "fc.tpm.tsv") + opj(config["paths"]["results"], "annotation", "{assembly}", "counts.tsv") script: "../scripts/quantification_utils.py" +rule rpkm: + """ + Calculate RPKM for genes in an assembly + """ + input: + opj(config["paths"]["results"], "annotation", "{assembly}", "counts.tsv") + output: + opj(config["paths"]["results"], "annotation", "{assembly}", "rpkm.tsv") + log: + opj(config["paths"]["results"], "annotation", "{assembly}", "rpkm.log") + params: + method = "RPKM" + conda: + "../envs/normalize.yml" + script: + "../scripts/normalize.R" + rule quantify_features: input: abund=opj(config["paths"]["results"], "annotation", "{assembly}", "fc.{fc_type}.tsv"), From 9ed0db71f6ceae3dd523956490cb68f5550eac90 Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 8 Oct 2020 14:38:40 +0200 Subject: [PATCH 15/63] Remove samtools stats --- workflow/rules/quantification.smk | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index 6485cd5b..4afa0a67 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -1,8 +1,8 @@ localrules: quantify, write_featurefile, - samtools_stats, aggregate_featurecount, + clean_featurecount, sum_to_taxa, quantify_features, sum_to_rgi @@ -71,20 +71,6 @@ rule remove_mark_duplicates: rm {params.temp_bam} {params.rehead_bam} {params.header} """ -rule samtools_stats: - input: - opj(config["paths"]["results"], "assembly", "{assembly}", - "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam") - output: - opj(config["paths"]["results"], "assembly", "{assembly}", - "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam.stats") - conda: - "../envs/quantify.yml" - shell: - """ - samtools stats {input} > {output} - """ - ##### featurecounts ##### rule featurecount: From 10c35596c04a7a9473593a37b271a13b6b0a1f6f Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 8 Oct 2020 15:06:26 +0200 Subject: [PATCH 16/63] Perform check for unclassified column --- workflow/scripts/normalize.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/normalize.R b/workflow/scripts/normalize.R index f81b636c..c8c763da 100644 --- a/workflow/scripts/normalize.R +++ b/workflow/scripts/normalize.R @@ -19,7 +19,9 @@ method <- snakemake@params$method # Read the counts x <- read.delim(snakemake@input[[1]], row.names = 1) # Remove unclassified -x <- x[row.names(x)!="Unclassified", ] +if ("Unclassified" %in% row.names(x)){ + x <- x[row.names(x)!="Unclassified", ] +} # Extract only numeric columns x_num <- num_cols(x) From 667cba42f3b7f03b5ed1e09928c6bb55e17a4608 Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 8 Oct 2020 15:07:01 +0200 Subject: [PATCH 17/63] Drop TPM functionality --- workflow/scripts/quantification_utils.py | 169 +++++++++-------------- 1 file changed, 68 insertions(+), 101 deletions(-) diff --git a/workflow/scripts/quantification_utils.py b/workflow/scripts/quantification_utils.py index 8629fef0..1e780aaf 100644 --- a/workflow/scripts/quantification_utils.py +++ b/workflow/scripts/quantification_utils.py @@ -36,117 +36,82 @@ def write_featurefile(sm, score=".", group="gene_id", phase="."): return -def calculate_tpm(df, readlength, fhlog): - """ - Calculates transcripts per million normalized values for a sample based - on featureCounts output - :param df: pandas DataFrame as read from featureCounts output - :param readlength: average length of mapped reads - :param fhlog: loghandle - :return: pandas DataFrame with normalized values per gene - """ - sampleName = df.columns[-1] - # 1. Calculate t for sample - # t = (reads_mapped_to_gene * read_length) / length_of_gene - # Multiply gene counts with read length, - # then divide by the Length column (in kb) - fhlog.write("Normalizing by read length and gene length\n") - t = df[sampleName].multiply(readlength).div(df["Length"].div(1000)) - df = df.assign(t=pd.Series(t, index=df.index)) - # 2. Calculate T - # T = sum(t) - fhlog.write("Calculating sum of normalized values\n") - T = df["t"].sum() - # 3. Calculate TPM - # TPM = t*10^6 / T - fhlog.write("Normalization factor T is {}\n".format(T)) - fhlog.write("Calculating TPM\n") - TPM = (df["t"].multiply(1000000)).div(T) - df = df.assign(TPM=pd.Series(TPM, index=df.index)) - return df - - -def get_readlength(f): - """ - Reads samtools flagstat file and extract average mapped read length - :param f: samtools flagstat file - :return: - """ - with open(f, 'r') as fhin: - for line in fhin: - line = line.rstrip() - if line.startswith("SN") and "average length:" in line: - length = line.rsplit()[-1] - return float(length) - - -def normalize_featurecount(sm): - """ - Master rule for running normalization - :param sm: snakemake object - :return: - """ - df = pd.read_csv(sm.input[0], skiprows=1, sep="\t") +def clean_featurecount(sm): + """ + This cleans the featureCounts output table from format: + # Program:featureCounts v2.0.0; Command:"featureCounts" "-a" "etc." + Geneid Chr Start End Strand Length path/to/bam/sample.bam + 1_1 k141_7581 1 459 - 459 1 + 2_1 k141_0 469 714 + 246 2 + + To format: + gene_id Length sample + k141_7581_1 459 1 + k141_0_1 246 1 + """ + df = pd.read_csv(sm.input[0], comment="#", sep="\t") + # Extract gene number and combine with contig id + df["gene_num"] = [x[1] for x in df.Geneid.str.split("_")] + df.set_index(df.Chr.map(str) + "_" + df.gene_num, inplace=True) + df.drop("gene_num", axis=1, inplace=True) + df.index.name = 'gene_id' + # Set sample and unit name from wildcards sample_unit = "{sample}_{unit}".format(sample=sm.wildcards.sample, unit=sm.wildcards.unit) df.columns = list(df.columns)[0:-1] + [sample_unit] - # Get average mapped read length - readlength = get_readlength(sm.input[1]) - - # Perform normalization - with open(sm.log[0], 'w') as fhlog: - df = calculate_tpm(df, readlength, fhlog) - - df_tpm = df.iloc[:, [0, -1]] - df_tpm.columns = ["gene_id", sample_unit] - df_tpm.to_csv(sm.output[0], sep="\t", index=False) - - df_raw = df.iloc[:, [0, 6]] - df_raw.columns = ["gene_id", sample_unit] - df_raw.to_csv(sm.output[1], sep="\t", index=False) - - -def merge_files(files, gff_df): - """ - Merges abundance tables from several samples - :param files: list of files - :param gff_df: pandas DataFrame with orf ids in the format _ - :return: merged pandas DataFrame - """ - df = pd.DataFrame() - for f in files: - _df = pd.read_csv(f, index_col=0, sep="\t") - df = pd.concat([df, _df], axis=1) - df = pd.merge(df, gff_df, left_index=True, right_on="gene_id") - df.drop("gene_id", axis=1, inplace=True) - df.set_index("orf", inplace=True) - return df + # Extract length and counts + df = df.loc[:, ["Length", sample_unit]] + df.to_csv(sm.output[0], sep="\t") def aggregate_featurecount(sm): """ - Aggregates normalized and raw tables per sample into one table per assembly + Aggregates cleaned featureCounts tables into one table per assembly :param sm: snakemake object :return: """ - gff_df = pd.read_csv(sm.input.gff_file, header=None, usecols=[0, 8], - names=["contig", "gene"], sep="\t") - gff_df = gff_df.assign( - gene_id=pd.Series([x.replace("gene_id ", "") for x in gff_df.gene], - index=gff_df.index)) - gff_df = gff_df.assign( - suffix=pd.Series([x.split(" ")[-1].split("_")[-1] for x in gff_df.gene], - index=gff_df.index)) - gff_df = gff_df.assign( - orf=pd.Series(gff_df.contig + "_" + gff_df.suffix, index=gff_df.index)) - gff_df = gff_df[["orf", "gene_id"]] - - raw_df = merge_files(sm.input.raw_files, gff_df) - tpm_df = merge_files(sm.input.tpm_files, gff_df) - raw_df.to_csv(sm.output.raw, sep="\t") - tpm_df.to_csv(sm.output.tpm, sep="\t") - + df = pd.DataFrame() + lmap = {} + for f in sm.input: + _df = pd.read_csv(f, sep="\t", index_col=0) + lmap.update(_df.to_dict()["Length"]) + _df.drop("Length", axis=1, inplace=True) + df = pd.merge(df, _df, right_index=True, left_index=True, how="outer") + counts = pd.merge(df, pd.DataFrame(lmap, index=["Length"]).T, + left_index=True, right_index=True) + counts.to_csv(sm.output[0], sep="\t") + + +def process_and_sum(q_df, annot_df): + # Merge annotations and abundance + # keep ORFs without annotation as "Unclassified" + annot_q_df = pd.merge(annot_df, q_df, left_index=True, right_index=True, + how="right") + annot_q_df.fillna("Unclassified", inplace=True) + feature_cols = annot_df.columns + annot_q_sum = annot_q_df.groupby(list(feature_cols)).sum().reset_index() + annot_q_sum.set_index(feature_cols[0], inplace=True) + return annot_q_sum + + +def sum_to_features(abundance, parsed): + parsed_df = pd.read_csv(parsed, index_col=0, sep="\t") + abundance_df = pd.read_csv(abundance, index_col=0, sep="\t") + abundance_df.drop("Length", axis=1, inplace=True, errors="ignore") + feature_sum = process_and_sum(abundance_df, parsed_df) + return feature_sum + + +def count_features(sm): + """ + Counts reads mapped to features such as KOs, PFAMs etc. + + :param sm: + :return: + """ + feature_sum = sum_to_features(sm.input.abundance, sm.input.parsed) + feature_sum.to_csv(sm.output[0], sep="\t") def sum_to_taxa(sm): """ @@ -184,10 +149,12 @@ def sum_to_rgi(sm): dfsum = df.groupby("AMR Gene Family").sum() dfsum.to_csv(sm.output[0], sep="\t", index=True, header=True) + def main(sm): toolbox = {"write_featurefile": write_featurefile, - "normalize_featurecount": normalize_featurecount, + "clean_featurecount": clean_featurecount, "aggregate_featurecount": aggregate_featurecount, + "count_features": count_features, "sum_to_taxa": sum_to_taxa, "sum_to_rgi": sum_to_rgi} From 3520bce43e9b2025b254fcaf810482c1ad636676 Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 8 Oct 2020 15:08:16 +0200 Subject: [PATCH 18/63] Clean up quantification with standard normalizations This commit removes support for TPM normalization and attempts to clean up how genes and features are quantified and normalized --- workflow/rules/quantification.smk | 44 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index 4afa0a67..c494b6f2 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -3,8 +3,9 @@ localrules: write_featurefile, aggregate_featurecount, clean_featurecount, + count_features, + normalize_features, sum_to_taxa, - quantify_features, sum_to_rgi ##### quantify master rule ##### @@ -12,8 +13,8 @@ localrules: rule quantify: input: expand(opj(config["paths"]["results"], "annotation", "{assembly}", - "fc.{fc_type}.tsv"), - assembly=assemblies.keys(), fc_type=["tpm", "raw"]) + "gene_{counts_type}.tsv"), + assembly=assemblies.keys(), counts_type=["counts", "rpkm"]) rule write_featurefile: @@ -115,11 +116,11 @@ rule clean_featurecount: rule aggregate_featurecount: """Aggregates all cleaned count files from featureCounts""" input: - get_all_files(samples, opj(config["paths"]["results"], - "assembly", "{assembly}", "mapping"), - ".fc.clean.tsv") + get_all_files(samples=samples, + dir=opj(config["paths"]["results"],"assembly", "{assembly}", "mapping"), + suffix=".fc.clean.tsv") output: - opj(config["paths"]["results"], "annotation", "{assembly}", "counts.tsv") + opj(config["paths"]["results"], "annotation", "{assembly}", "gene_counts.tsv") script: "../scripts/quantification_utils.py" @@ -128,9 +129,9 @@ rule rpkm: Calculate RPKM for genes in an assembly """ input: - opj(config["paths"]["results"], "annotation", "{assembly}", "counts.tsv") + opj(config["paths"]["results"], "annotation", "{assembly}", "gene_counts.tsv") output: - opj(config["paths"]["results"], "annotation", "{assembly}", "rpkm.tsv") + opj(config["paths"]["results"], "annotation", "{assembly}", "gene_rpkm.tsv") log: opj(config["paths"]["results"], "annotation", "{assembly}", "rpkm.log") params: @@ -140,21 +141,18 @@ rule rpkm: script: "../scripts/normalize.R" -rule quantify_features: +rule count_features: input: - abund=opj(config["paths"]["results"], "annotation", "{assembly}", "fc.{fc_type}.tsv"), + abund=opj(config["paths"]["results"], "annotation", "{assembly}", "gene_counts.tsv"), annot=opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.tsv") output: - opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.{fc_type}.tsv") - shell: - """ - python workflow/scripts/eggnog-parser.py \ - quantify {input.abund} {input.annot} {output[0]} - """ + opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.counts.tsv") + script: + "../scripts/quantification_utils.py" -rule normalize: +rule normalize_features: input: - opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.raw.tsv") + opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.counts.tsv") output: opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.{norm_method}.tsv") log: @@ -170,18 +168,18 @@ rule sum_to_taxa: input: tax=opj(config["paths"]["results"], "annotation", "{assembly}", "taxonomy", "orfs.{db}.taxonomy.tsv".format(db=config["taxonomy"]["database"])), - abund=opj(config["paths"]["results"], "annotation", "{assembly}", "fc.{fc_type}.tsv") + abund=opj(config["paths"]["results"], "annotation", "{assembly}", "gene_{counts_type}.tsv") output: - opj(config["paths"]["results"], "annotation", "{assembly}", "taxonomy", "tax.{fc_type}.tsv") + opj(config["paths"]["results"], "annotation", "{assembly}", "taxonomy", "tax.{counts_type}.tsv") script: "../scripts/quantification_utils.py" rule sum_to_rgi: input: annot=opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.out.txt"), - abund=opj(config["paths"]["results"], "annotation", "{assembly}", "fc.{fc_type}.tsv") + abund=opj(config["paths"]["results"], "annotation", "{assembly}", "gene_{counts_type}.tsv") output: - opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.{fc_type}.tsv") + opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.{counts_type}.tsv") script: "../scripts/quantification_utils.py" From f52228aeb603fdfe35c5463de8fad7e24500c0bc Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 8 Oct 2020 15:08:37 +0200 Subject: [PATCH 19/63] Add wildcard constraints for counts_type --- workflow/rules/common.smk | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index f8e18459..6c9f52cc 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -36,6 +36,7 @@ wildcard_constraints: binner="[a-z]+", group="\w+", l="\d+", + counts_type="(counts|rpkm)", norm_method="(TMM|CSS|REL)" from scripts.common import check_uppmax, check_annotation, check_assembly, check_classifiers From cae1c920778285d60cbc95eced4cd38cd9fb4cd1 Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 8 Oct 2020 15:13:52 +0200 Subject: [PATCH 20/63] Update quantified feature output --- workflow/scripts/common.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index a79b5709..d8848a0f 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -611,22 +611,25 @@ def annotation_input(config, assemblies): # Add EGGNOG annotation if config["annotation"]["eggnog"]: input += expand(opj(config["paths"]["results"], "annotation", group, - "{db}.parsed.{fc}.tsv"), + "{db}.parsed.{norm_method}.tsv"), db=["enzymes", "pathways", "kos", "modules"], - fc=["raw", "tpm"]) + norm_method=["counts", "TMM", "REL", "CSS"]) # Add PFAM annotation if config["annotation"]["pfam"]: input += expand(opj(config["paths"]["results"], "annotation", group, - "pfam.parsed.{fc}.tsv"), fc=["tpm", "raw"]) + "pfam.parsed.{norm_method}.tsv"), + norm_method=["counts", "TMM", "REL", "CSS"]) # Add taxonomic annotation if config["annotation"]["taxonomy"]: input += expand( opj(config["paths"]["results"], "annotation", group, "taxonomy", - "tax.{fc}.tsv"), fc=["tpm", "raw"]) + "tax.{counts_type}.tsv"), + counts_type=["counts", "rpkm"]) # Add Resistance Gene Identifier output if config["annotation"]["rgi"]: input += expand(opj(config["paths"]["results"], "annotation", group, - "rgi.{fc}.tsv"), fc=["raw", "tpm"]) + "rgi.{norm_method}.tsv"), + norm_method=["counts", "TMM", "REL", "CSS"]) input.append(opj(config["paths"]["results"], "annotation", group, "rgi.out.txt")) return input From 40b57fb04671395c6946bbe32a61d6d75887f6d6 Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 11:23:25 +0200 Subject: [PATCH 21/63] Remove length column when quantifying taxa --- workflow/scripts/quantification_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflow/scripts/quantification_utils.py b/workflow/scripts/quantification_utils.py index 1e780aaf..21bf9797 100644 --- a/workflow/scripts/quantification_utils.py +++ b/workflow/scripts/quantification_utils.py @@ -126,6 +126,8 @@ def sum_to_taxa(sm): df = pd.read_csv(sm.input.tax, sep="\t", index_col=0, header=None, names=header) abund_df = pd.read_csv(sm.input.abund, header=0, index_col=0, sep="\t") + # Remove length column + abund_df.drop("Length", axis=1, inplace=True, errors="ignore") taxa_abund = pd.merge(df, abund_df, right_index=True, left_index=True) taxa_abund_sum = taxa_abund.groupby(header[1:]).sum().reset_index() taxa_abund_sum.to_csv(sm.output[0], sep="\t", index=False) From 465ba135ba3c744f03ece2060ca25b058d40c9ee Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 11:23:54 +0200 Subject: [PATCH 22/63] Normalize counts after summing for RGI --- workflow/scripts/common.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index d8848a0f..0e9b06b2 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -628,10 +628,8 @@ def annotation_input(config, assemblies): # Add Resistance Gene Identifier output if config["annotation"]["rgi"]: input += expand(opj(config["paths"]["results"], "annotation", group, - "rgi.{norm_method}.tsv"), + "rgi.parsed.{norm_method}.tsv"), norm_method=["counts", "TMM", "REL", "CSS"]) - input.append(opj(config["paths"]["results"], "annotation", group, - "rgi.out.txt")) return input From 1ee63292796dcfd2004048f6e46eda01f59d7788 Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 11:27:42 +0200 Subject: [PATCH 23/63] Update RGI version --- workflow/envs/rgi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/rgi.yml b/workflow/envs/rgi.yml index 85cf0673..b0657038 100644 --- a/workflow/envs/rgi.yml +++ b/workflow/envs/rgi.yml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - rgi=4.0.3 + - rgi=5.1.1 From cc2aea658d8a7ce6f419b124399b49a5c520b8f8 Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 13:42:02 +0200 Subject: [PATCH 24/63] Update snakemake input for summing to features --- workflow/scripts/quantification_utils.py | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/workflow/scripts/quantification_utils.py b/workflow/scripts/quantification_utils.py index 21bf9797..deddaf3b 100644 --- a/workflow/scripts/quantification_utils.py +++ b/workflow/scripts/quantification_utils.py @@ -110,7 +110,7 @@ def count_features(sm): :param sm: :return: """ - feature_sum = sum_to_features(sm.input.abundance, sm.input.parsed) + feature_sum = sum_to_features(sm.input.abund, sm.input.annot) feature_sum.to_csv(sm.output[0], sep="\t") def sum_to_taxa(sm): @@ -133,32 +133,12 @@ def sum_to_taxa(sm): taxa_abund_sum.to_csv(sm.output[0], sep="\t", index=False) -def sum_to_rgi(sm): - """ - Takes Resistance gene identifier output and abundance info per orf and sums - values to gene family level - - :param sm: snakemake object - :return: - """ - annot = pd.read_csv(sm.input.annot, sep="\t", header=0, index_col=0, - usecols=[0, 16]) - # Rename index for annotations to remove text after whitespace - annot.rename(index=lambda x: x.split(" ")[0], inplace=True) - abund = pd.read_csv(sm.input.abund, sep="\t", header=0, index_col=0) - df = pd.merge(annot, abund, left_index=True, right_index=True) - # Sum to Gene family - dfsum = df.groupby("AMR Gene Family").sum() - dfsum.to_csv(sm.output[0], sep="\t", index=True, header=True) - - def main(sm): toolbox = {"write_featurefile": write_featurefile, "clean_featurecount": clean_featurecount, "aggregate_featurecount": aggregate_featurecount, "count_features": count_features, - "sum_to_taxa": sum_to_taxa, - "sum_to_rgi": sum_to_rgi} + "sum_to_taxa": sum_to_taxa} toolbox[sm.rule](sm) From 42ea0e67e4b9ab9e549c16ad9dc4e424a9a1e879 Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 13:42:25 +0200 Subject: [PATCH 25/63] Update RGI rule to match version change --- workflow/rules/annotation.smk | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk index fd60c200..883a9961 100644 --- a/workflow/rules/annotation.smk +++ b/workflow/rules/annotation.smk @@ -411,6 +411,7 @@ rule rgi: params: out=opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.out"), settings="-a diamond --local --clean --input_type protein" + shadow: "minimal" conda: "../envs/rgi.yml" threads: 10 @@ -418,7 +419,15 @@ rule rgi: runtime=lambda wildcards, attempt: attempt**2*60 shell: """ - rgi load --card_json {input.db} --local > {log} 2>&1 + rgi load -i {input.db} --local > {log} 2>&1 rgi main -i {input.faa} -o {params.out} \ -n {threads} {params.settings} >>{log} 2>>{log} """ + +rule parse_rgi: + input: + txt=opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.out.txt") + output: + tsv=opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.parsed.tsv") + script: + "../scripts/annotation_utils.py" From f4fd582fff64f22902a0b542c437da80878968d5 Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 13:45:06 +0200 Subject: [PATCH 26/63] Add function to parse RGI output similarly to PFAM output --- workflow/scripts/annotation_utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/annotation_utils.py b/workflow/scripts/annotation_utils.py index 09bf745f..b878252c 100644 --- a/workflow/scripts/annotation_utils.py +++ b/workflow/scripts/annotation_utils.py @@ -3,6 +3,13 @@ import pandas as pd +def parse_rgi(sm): + annot = pd.read_csv(sm.input.txt, sep="\t", index_col=0) + annot = annot.loc[:, ["AMR Gene Family", "Resistance Mechanism"]] + annot.rename(index=lambda x: x.split(" ")[0], inplace=True) + annot.to_csv(sm.output.tsv, sep="\t", index=True) + + def parse_pfam(sm): annot = pd.read_csv(sm.input[0], comment="#", header=None, sep=" +", usecols=[0, 5, 7, 14], engine="python", @@ -28,7 +35,8 @@ def parse_pfam(sm): def main(sm): - toolbox = {"parse_pfam": parse_pfam} + toolbox = {"parse_pfam": parse_pfam, + "parse_rgi": parse_rgi} toolbox[sm.rule](sm) From 070cda4e497b55688126afd4d9c397e4e85c10a1 Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 13:51:21 +0200 Subject: [PATCH 27/63] Remove specific rule for summing RGI --- workflow/rules/quantification.smk | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index c494b6f2..b5dbd44f 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -142,6 +142,9 @@ rule rpkm: "../scripts/normalize.R" rule count_features: + """ + Sums read counts for gene annotation features such as pfam, KOs etc. + """ input: abund=opj(config["paths"]["results"], "annotation", "{assembly}", "gene_counts.tsv"), annot=opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.tsv") @@ -151,6 +154,9 @@ rule count_features: "../scripts/quantification_utils.py" rule normalize_features: + """ + Normalizes counts of features using TMM, REL or CSS + """ input: opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.counts.tsv") output: @@ -165,6 +171,9 @@ rule normalize_features: "../scripts/normalize.R" rule sum_to_taxa: + """ + Sums read counts and RPKM values for genes to assigned taxonomy + """ input: tax=opj(config["paths"]["results"], "annotation", "{assembly}", "taxonomy", "orfs.{db}.taxonomy.tsv".format(db=config["taxonomy"]["database"])), @@ -173,13 +182,3 @@ rule sum_to_taxa: opj(config["paths"]["results"], "annotation", "{assembly}", "taxonomy", "tax.{counts_type}.tsv") script: "../scripts/quantification_utils.py" - -rule sum_to_rgi: - input: - annot=opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.out.txt"), - abund=opj(config["paths"]["results"], "annotation", "{assembly}", "gene_{counts_type}.tsv") - output: - opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.{counts_type}.tsv") - script: - "../scripts/quantification_utils.py" - From 70f4f64fe04044572b6b3af5f9dffe6fbe046fac Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 21:36:27 +0200 Subject: [PATCH 28/63] Fix spelling of RLE method --- workflow/rules/common.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 6c9f52cc..5ec04ede 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -37,7 +37,7 @@ wildcard_constraints: group="\w+", l="\d+", counts_type="(counts|rpkm)", - norm_method="(TMM|CSS|REL)" + norm_method="(TMM|CSS|RLE)" from scripts.common import check_uppmax, check_annotation, check_assembly, check_classifiers From 53e9ada8a2a7cb53642b362be0b4ab3f0149e80a Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 21:37:33 +0200 Subject: [PATCH 29/63] Fix spelling of RLE method --- workflow/scripts/common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index 0e9b06b2..378c6350 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -613,12 +613,12 @@ def annotation_input(config, assemblies): input += expand(opj(config["paths"]["results"], "annotation", group, "{db}.parsed.{norm_method}.tsv"), db=["enzymes", "pathways", "kos", "modules"], - norm_method=["counts", "TMM", "REL", "CSS"]) + norm_method=["counts", "TMM", "RLE", "CSS"]) # Add PFAM annotation if config["annotation"]["pfam"]: input += expand(opj(config["paths"]["results"], "annotation", group, "pfam.parsed.{norm_method}.tsv"), - norm_method=["counts", "TMM", "REL", "CSS"]) + norm_method=["counts", "TMM", "RLE", "CSS"]) # Add taxonomic annotation if config["annotation"]["taxonomy"]: input += expand( @@ -629,7 +629,7 @@ def annotation_input(config, assemblies): if config["annotation"]["rgi"]: input += expand(opj(config["paths"]["results"], "annotation", group, "rgi.parsed.{norm_method}.tsv"), - norm_method=["counts", "TMM", "REL", "CSS"]) + norm_method=["counts", "TMM", "RLE", "CSS"]) return input From dc7b801b92709c5e0ad98d67365946e00d3b5bbb Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 21:37:53 +0200 Subject: [PATCH 30/63] Include unique CARD model id --- workflow/scripts/annotation_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/annotation_utils.py b/workflow/scripts/annotation_utils.py index b878252c..657957bf 100644 --- a/workflow/scripts/annotation_utils.py +++ b/workflow/scripts/annotation_utils.py @@ -5,7 +5,8 @@ def parse_rgi(sm): annot = pd.read_csv(sm.input.txt, sep="\t", index_col=0) - annot = annot.loc[:, ["AMR Gene Family", "Resistance Mechanism"]] + annot = annot.loc[:, ["Model_ID", "AMR Gene Family", "Resistance Mechanism"]] + annot.loc[:, "Model_ID"] = ["RGI_{}".format(x) for x in annot.Model_ID] annot.rename(index=lambda x: x.split(" ")[0], inplace=True) annot.to_csv(sm.output.tsv, sep="\t", index=True) From 5b099b10ef61f1daa49c83882845ad3e3cdfa5f2 Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 21:52:39 +0200 Subject: [PATCH 31/63] Add minimum mapping qual for featureCounts --- workflow/rules/quantification.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index b5dbd44f..7cd9441f 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -91,7 +91,7 @@ rule featurecount: threads: 4 params: tmpdir=config["paths"]["temp"], - setting=lambda wildcards: "-B -p" if wildcards.seq_type == "pe" else "" + setting=lambda wildcards: "-Q 10 -B -p" if wildcards.seq_type == "pe" else "" resources: runtime=lambda wildcards, attempt: attempt**2*30 conda: From 69d102addaf2fb944a16a1345a1bd605adfdf8b2 Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 22:30:18 +0200 Subject: [PATCH 32/63] Handle cases where samples have only 1 feature --- workflow/scripts/normalize.R | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/normalize.R b/workflow/scripts/normalize.R index c8c763da..66bdb489 100644 --- a/workflow/scripts/normalize.R +++ b/workflow/scripts/normalize.R @@ -17,11 +17,16 @@ str_cols <- function(x) { method <- snakemake@params$method # Read the counts -x <- read.delim(snakemake@input[[1]], row.names = 1) +x <- read.delim(snakemake@input[[1]], row.names = 1, sep = "\t", header = TRUE) # Remove unclassified if ("Unclassified" %in% row.names(x)){ x <- x[row.names(x)!="Unclassified", ] } +if (nrow(x)==0) { + write.table(x, file = snakemake@output[[1]], quote = FALSE, sep="\t") + quit() +} + # Extract only numeric columns x_num <- num_cols(x) @@ -46,6 +51,16 @@ if (method %in% c("TMM", "RLE")) { } else { library(metagenomeSeq) obj <- newMRexperiment(x_num) + smat = lapply(1:ncol(x_num), function(i) { + sort(x_num[which(x_num[, i]>0),i], decreasing = TRUE) + }) + if(any(sapply(smat,length)==1)) { + fh <-file(snakemake@output[[1]]) + writeLines(c("WARNING: Sample with one or zero features", + "Cumulative Sum Scaling failed for sample"), fh) + close(fh) + quit() + } norm <- MRcounts(obj, norm = TRUE) } From 05c949b2e26dcf282bdcf11770d648e3c9a5d07c Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 22:31:11 +0200 Subject: [PATCH 33/63] Add testing for RGI --- .github/workflows/main.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 14b2bdde..c97a0034 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -49,10 +49,13 @@ jobs: - name: Kraken run: | snakemake --use-conda -j 4 --configfile .test/config/kraken.yaml -p results/kraken/sample1_1_pe.kreport results/kraken/sample4_1_se.kreport - - name: Metaspades + - name: Metaspades + RGI run: | snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p assemble rm -r results/assembly results/report examples/data/sample* + - name: RGI + run: | + snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p annotate - name: Prepare taxonomy run: bash .test/scripts/prep_taxonomy.sh - name: Taxonomy From 362762bfcafd29880727e251f9ee218a2fc75231 Mon Sep 17 00:00:00 2001 From: johnne Date: Mon, 12 Oct 2020 22:31:37 +0200 Subject: [PATCH 34/63] Update metaspades test config --- .test/config/metaspades.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.test/config/metaspades.yaml b/.test/config/metaspades.yaml index 96b5ad14..e619bec8 100644 --- a/.test/config/metaspades.yaml +++ b/.test/config/metaspades.yaml @@ -117,13 +117,13 @@ annotation: # run tRNAscan-SE? tRNAscan: False # run infernal for rRNA identification? - infernal: True + infernal: False # run eggnog-mapper to infer KEGG orthologs, pathways and modules? eggnog: False # run PFAM-scan to infer protein families from PFAM? - pfam: True + pfam: False # run Resistance gene identifier? - rgi: False + rgi: True # run taxonomic annotation of assembled contigs (using tango + sourmash)? taxonomy: False From 819643a392dc9d8f5a71cef6b96852490a60c683 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 08:11:08 +0200 Subject: [PATCH 35/63] Update localrules --- workflow/rules/quantification.smk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index 7cd9441f..95bc40f8 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -1,12 +1,12 @@ localrules: quantify, write_featurefile, - aggregate_featurecount, clean_featurecount, + aggregate_featurecount, + rpkm, count_features, normalize_features, - sum_to_taxa, - sum_to_rgi + sum_to_taxa ##### quantify master rule ##### From 0a02f260a1e77662f9810d773255575021910efe Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 08:19:18 +0200 Subject: [PATCH 36/63] Update localrules --- workflow/rules/annotation.smk | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk index 883a9961..1a3c9e3e 100644 --- a/workflow/rules/annotation.smk +++ b/workflow/rules/annotation.smk @@ -2,16 +2,17 @@ from scripts.common import annotation_input localrules: annotate, - parse_ko_annotations, - parse_pfam, download_rfams, press_rfams, download_pfam, - press_pfam, download_pfam_info, + press_pfam, + parse_pfam, download_eggnog, get_kegg_info, - download_rgi_data + parse_ko_annotations, + download_rgi_data, + parse_rgi ##### annotation master rule ##### From db2405f3b29d2d9de33434567247ea4db439f20f Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 08:22:38 +0200 Subject: [PATCH 37/63] Update localrules --- workflow/rules/assembly.smk | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk index 6246192f..35890f2a 100644 --- a/workflow/rules/assembly.smk +++ b/workflow/rules/assembly.smk @@ -16,6 +16,8 @@ rule assemble: opj(config["paths"]["results"], "report", "assembly", "alignment_frequency.pdf") if config["assembly"]["metaspades"]: + localrules: + generate_metaspades_input rule generate_metaspades_input: """Generate input files for use with Metaspades""" input: @@ -97,6 +99,8 @@ if config["assembly"]["metaspades"]: """ else: + localrules: + generate_megahit_input rule generate_megahit_input: """Generate input lists for Megahit""" input: From 66758a64ca3a56de63dea1e948113a24ae8072c5 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 08:33:56 +0200 Subject: [PATCH 38/63] Update localrules --- workflow/rules/binning.smk | 1 - 1 file changed, 1 deletion(-) diff --git a/workflow/rules/binning.smk b/workflow/rules/binning.smk index 232f1be1..c341f678 100644 --- a/workflow/rules/binning.smk +++ b/workflow/rules/binning.smk @@ -22,7 +22,6 @@ localrules: cluster_genomes, count_rRNA, count_tRNA, - aggregate_gtdbtk, aggregate_bin_annot, binning_report From 960b2c85182808da205bbd561a6a888d54a04ef4 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 08:42:55 +0200 Subject: [PATCH 39/63] Change channel order --- workflow/envs/normalize.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/normalize.yml b/workflow/envs/normalize.yml index 273cdfa9..6b0d5133 100644 --- a/workflow/envs/normalize.yml +++ b/workflow/envs/normalize.yml @@ -1,7 +1,7 @@ name: normalize channels: - - r - bioconda + - r - defaults dependencies: - bioconductor-edger=3.30.0 From 11d01e395d6bbc74f0e38ccb7484d5aac3f9a554 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 09:23:01 +0200 Subject: [PATCH 40/63] Clear versions for normalize.yml --- workflow/envs/normalize.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/workflow/envs/normalize.yml b/workflow/envs/normalize.yml index 6b0d5133..f60900c0 100644 --- a/workflow/envs/normalize.yml +++ b/workflow/envs/normalize.yml @@ -1,8 +1,7 @@ name: normalize channels: - bioconda - - r - defaults dependencies: - - bioconductor-edger=3.30.0 - - bioconductor-metagenomeseq=1.30.0 + - bioconductor-edger + - bioconductor-metagenomeseq From 98e219649bacaa0c666aaf2bc926b3247b6f45e9 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 12:17:56 +0200 Subject: [PATCH 41/63] Formatting fixes after running pre-commit --- workflow/envs/annotation.yml | 2 +- workflow/envs/barrnap.yml | 2 +- workflow/envs/centrifuge.yml | 2 +- workflow/envs/checkm.yml | 2 +- workflow/envs/concoct.yml | 2 +- workflow/envs/cookiecutter.yml | 2 +- workflow/envs/graphlan.yml | 2 +- workflow/envs/kraken.yml | 2 +- workflow/envs/krona.yml | 2 +- workflow/envs/maxbin.yml | 2 +- workflow/envs/megahit.yml | 2 +- workflow/envs/metabat.yml | 2 +- workflow/envs/metaphlan.yml | 2 +- workflow/envs/metaspades.yml | 2 +- workflow/envs/plotting.yml | 2 +- workflow/envs/preprocess.yml | 2 +- workflow/envs/quantify.yml | 2 +- 17 files changed, 17 insertions(+), 17 deletions(-) diff --git a/workflow/envs/annotation.yml b/workflow/envs/annotation.yml index f3daf0a7..924d0138 100644 --- a/workflow/envs/annotation.yml +++ b/workflow/envs/annotation.yml @@ -8,4 +8,4 @@ dependencies: - pfam_scan=1.6 - eggnog-mapper=2.0.1 - infernal=1.1.2 - - trnascan-se=2.0.5 \ No newline at end of file + - trnascan-se=2.0.5 diff --git a/workflow/envs/barrnap.yml b/workflow/envs/barrnap.yml index 53ae4d61..ff933f29 100644 --- a/workflow/envs/barrnap.yml +++ b/workflow/envs/barrnap.yml @@ -4,4 +4,4 @@ channels: - conda-forge - defaults dependencies: - - barrnap=0.9 \ No newline at end of file + - barrnap=0.9 diff --git a/workflow/envs/centrifuge.yml b/workflow/envs/centrifuge.yml index d4579e74..af69b00d 100644 --- a/workflow/envs/centrifuge.yml +++ b/workflow/envs/centrifuge.yml @@ -2,4 +2,4 @@ channels: - bioconda - defaults dependencies: - - centrifuge=1.0.4_beta \ No newline at end of file + - centrifuge=1.0.4_beta diff --git a/workflow/envs/checkm.yml b/workflow/envs/checkm.yml index 35cc6929..d0488e59 100644 --- a/workflow/envs/checkm.yml +++ b/workflow/envs/checkm.yml @@ -5,4 +5,4 @@ channels: - defaults dependencies: - python=3.7.6 - - checkm-genome=1.1.2 \ No newline at end of file + - checkm-genome=1.1.2 diff --git a/workflow/envs/concoct.yml b/workflow/envs/concoct.yml index f5f23857..b3a12405 100644 --- a/workflow/envs/concoct.yml +++ b/workflow/envs/concoct.yml @@ -5,4 +5,4 @@ channels: dependencies: - python=3.7.6 - biopython=1.76 - - concoct=1.1.0 \ No newline at end of file + - concoct=1.1.0 diff --git a/workflow/envs/cookiecutter.yml b/workflow/envs/cookiecutter.yml index 35633e58..5ea2b60a 100644 --- a/workflow/envs/cookiecutter.yml +++ b/workflow/envs/cookiecutter.yml @@ -2,4 +2,4 @@ name: cookiecutter channels: - conda-forge dependencies: - - cookiecutter \ No newline at end of file + - cookiecutter diff --git a/workflow/envs/graphlan.yml b/workflow/envs/graphlan.yml index 68c09ae9..6bb097c6 100644 --- a/workflow/envs/graphlan.yml +++ b/workflow/envs/graphlan.yml @@ -2,4 +2,4 @@ channels: - bioconda dependencies: - graphlan=1.1.3 - - export2graphlan=0.20 \ No newline at end of file + - export2graphlan=0.20 diff --git a/workflow/envs/kraken.yml b/workflow/envs/kraken.yml index f582d7e7..a8e77cb4 100644 --- a/workflow/envs/kraken.yml +++ b/workflow/envs/kraken.yml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - kraken2=2.0.8_beta \ No newline at end of file + - kraken2=2.0.8_beta diff --git a/workflow/envs/krona.yml b/workflow/envs/krona.yml index eb0bfce6..1596854c 100644 --- a/workflow/envs/krona.yml +++ b/workflow/envs/krona.yml @@ -3,4 +3,4 @@ channels: - defaults dependencies: - krona=2.7.1 - - make=4.2.1 \ No newline at end of file + - make=4.2.1 diff --git a/workflow/envs/maxbin.yml b/workflow/envs/maxbin.yml index 46fac4a1..5c9396c8 100644 --- a/workflow/envs/maxbin.yml +++ b/workflow/envs/maxbin.yml @@ -2,4 +2,4 @@ channels: - bioconda dependencies: - maxbin2=2.2.7 - - perl-lwp-simple=6.15 \ No newline at end of file + - perl-lwp-simple=6.15 diff --git a/workflow/envs/megahit.yml b/workflow/envs/megahit.yml index abe57440..2144d4a0 100644 --- a/workflow/envs/megahit.yml +++ b/workflow/envs/megahit.yml @@ -4,4 +4,4 @@ channels: - defaults dependencies: - python=3.8.1 - - megahit=1.2.9 \ No newline at end of file + - megahit=1.2.9 diff --git a/workflow/envs/metabat.yml b/workflow/envs/metabat.yml index 4b44d966..094da655 100644 --- a/workflow/envs/metabat.yml +++ b/workflow/envs/metabat.yml @@ -4,4 +4,4 @@ channels: - conda-forge - defaults dependencies: - - metabat2=2.14 \ No newline at end of file + - metabat2=2.14 diff --git a/workflow/envs/metaphlan.yml b/workflow/envs/metaphlan.yml index ab3e45a7..5b7adde4 100644 --- a/workflow/envs/metaphlan.yml +++ b/workflow/envs/metaphlan.yml @@ -5,4 +5,4 @@ channels: - defaults dependencies: - python=3.7.6 - - metaphlan=3.0 \ No newline at end of file + - metaphlan=3.0 diff --git a/workflow/envs/metaspades.yml b/workflow/envs/metaspades.yml index 36c49084..7c45e1c6 100644 --- a/workflow/envs/metaspades.yml +++ b/workflow/envs/metaspades.yml @@ -4,4 +4,4 @@ channels: - defaults dependencies: - python=3.8.1 - - spades=3.14.0 \ No newline at end of file + - spades=3.14.0 diff --git a/workflow/envs/plotting.yml b/workflow/envs/plotting.yml index 1ed9df4a..1d619ec2 100644 --- a/workflow/envs/plotting.yml +++ b/workflow/envs/plotting.yml @@ -6,4 +6,4 @@ dependencies: - python=3.8.2 - seaborn=0.10.1 - pandas=1.0.3 - - jupyter=1.0.0 \ No newline at end of file + - jupyter=1.0.0 diff --git a/workflow/envs/preprocess.yml b/workflow/envs/preprocess.yml index 0a876c94..d3cd44ff 100644 --- a/workflow/envs/preprocess.yml +++ b/workflow/envs/preprocess.yml @@ -10,4 +10,4 @@ dependencies: - fastqc=0.11.9 - multiqc=1.8 - fastuniq=1.1 - - bowtie2=2.3.5.1 \ No newline at end of file + - bowtie2=2.3.5.1 diff --git a/workflow/envs/quantify.yml b/workflow/envs/quantify.yml index 73d39cf7..0d4626f4 100644 --- a/workflow/envs/quantify.yml +++ b/workflow/envs/quantify.yml @@ -7,4 +7,4 @@ dependencies: - subread=2.0.0 - samtools=1.9 - picard=2.21.9 - - bowtie2=2.4.1 \ No newline at end of file + - bowtie2=2.4.1 From 63a564f7405958e71d9036e9a2e7837188b0971a Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 12:18:38 +0200 Subject: [PATCH 42/63] Formatting fixes after running pre-commit --- workflow/rules/assembly.smk | 16 ++++++++-------- workflow/rules/binning.smk | 26 +++++++++++++------------- workflow/rules/classification.smk | 12 ++++++------ workflow/rules/preprocessing.smk | 8 ++++---- workflow/rules/taxonomy.smk | 4 ++-- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk index 35890f2a..416bfacc 100644 --- a/workflow/rules/assembly.smk +++ b/workflow/rules/assembly.smk @@ -77,7 +77,7 @@ if config["assembly"]["metaspades"]: metaspades.py \ -t {threads} -1 {input.R1} -2 {input.R2} $single \ -o {params.tmp} > {log} 2>&1 - + # If set to keep intermediate contigs, move to intermediate folder before deleting if [ "{config[metaspades][keep_intermediate]}" == "True" ]; then mkdir -p {params.intermediate_contigs} @@ -87,7 +87,7 @@ if config["assembly"]["metaspades"]: mkdir -p {params.corrected} cp -r {params.tmp}/corrected {params.corrected} fi - + # Clear intermediate contigs rm -rf {params.tmp}/K* # Clear corrected reads dir @@ -162,20 +162,20 @@ else: else single="" fi - + # Run Megahit megahit -t {threads} $paired $single -o {params.tmp} \ {params.additional_settings} >{log} 2>&1 - + # Sync intermediate contigs if asked for if [ "{config[megahit][keep_intermediate]}" == "True" ]; then mkdir -p {params.intermediate_contigs} cp -r {params.tmp}/intermediate_contigs/* {params.intermediate_contigs} fi - + # Cleanup intermediate rm -rf {params.tmp}/intermediate_contigs - + # Sync tmp output to outdir before removing cp -r {params.tmp}/* {params.output_dir} rm -rf {params.tmp} @@ -277,7 +277,7 @@ rule bowtie_map_se: shell: """ bowtie2 {params.setting} -p {threads} -x {params.prefix} \ - -U {input.se} 2>{output.log} | samtools view -bh - | samtools sort - -o {params.temp_bam} + -U {input.se} 2>{output.log} | samtools view -bh - | samtools sort - -o {params.temp_bam} samtools index {params.temp_bam} mv {params.temp_bam} {output.bam} mv {params.temp_bam}.bai {output.bai} @@ -304,7 +304,7 @@ rule samtools_flagstat: "../envs/quantify.yml" shell: """ - for f in {input} ; + for f in {input} ; do al=$(samtools \ flagstat \ diff --git a/workflow/rules/binning.smk b/workflow/rules/binning.smk index c341f678..ddb2f339 100644 --- a/workflow/rules/binning.smk +++ b/workflow/rules/binning.smk @@ -9,7 +9,7 @@ localrules: contig_map, binning_stats, aggregate_binning_stats, - download_checkm, + download_checkm, checkm_qa, remove_checkm_zerocols, aggregate_checkm_stats, @@ -118,12 +118,12 @@ rule maxbin: else # Rename fasta files ls {params.tmp_dir} | grep ".fasta" | while read f; - do + do mv {params.tmp_dir}/$f {params.dir}/${{f%.fasta}}.fa done # Move output from temporary dir ls {params.tmp_dir} | while read f; - do + do mv {params.tmp_dir}/$f {params.dir}/ done fi @@ -154,11 +154,11 @@ rule concoct_coverage_table: p=POSTPROCESS shell: """ - for f in {input.bam} ; - do - n=$(basename $f); - s=$(echo -e $n | sed 's/_[ps]e{params.p}.bam//g'); - echo $s; + for f in {input.bam} ; + do + n=$(basename $f); + s=$(echo -e $n | sed 's/_[ps]e{params.p}.bam//g'); + echo $s; done > {params.samplenames} concoct_coverage_table.py \ --samplenames {params.samplenames} \ @@ -422,7 +422,7 @@ rule checkm_qa: else checkm qa -o 2 --tab_table -f {output.tsv} \ {input.ms} {params.dir} > {log} 2>&1 - fi + fi """ rule checkm_coverage: @@ -572,7 +572,7 @@ rule gtdbtk_classify: rule aggregate_gtdbtk: """ - Aggregates GTDB-TK phylogenetic results from several assemblies into a + Aggregates GTDB-TK phylogenetic results from several assemblies into a single table. """ input: @@ -634,7 +634,7 @@ rule barrnap: barrnap --kingdom $k --quiet {params.indir}/$g.fa | \ egrep -v "^#" | sed "s/$/;genome=$g/g" >> {output} done - fi + fi """ rule count_rRNA: @@ -687,7 +687,7 @@ rule trnascan_bins: tRNAscan-SE $model --quiet --thread {threads} \ {params.indir}/$g.fa | tail -n +4 | sed "s/$/\t$g/g" >> {output} done - fi + fi """ rule count_tRNA: @@ -704,7 +704,7 @@ rule count_tRNA: rule aggregate_bin_annot: input: - trna=expand(opj(config["paths"]["results"], "binning", "{binner}", + trna=expand(opj(config["paths"]["results"], "binning", "{binner}", "{assembly}", "{l}", "tRNAscan", "tRNA.total.tsv"), binner=get_binners(config), assembly=assemblies.keys(), diff --git a/workflow/rules/classification.smk b/workflow/rules/classification.smk index 453ab6fc..db728d6c 100644 --- a/workflow/rules/classification.smk +++ b/workflow/rules/classification.smk @@ -141,7 +141,7 @@ rule download_centrifuge_build: tar -C {params.dir} -xf {params.tar} >>{log} 2>&1 rm {params.tar} """ - + rule centrifuge_pe: input: R1=opj(config["paths"]["results"], "intermediate", "preprocess", @@ -212,13 +212,13 @@ rule centrifuge_se: rule centrifuge_kreport: input: - f=opj(config["paths"]["results"], "centrifuge", + f=opj(config["paths"]["results"], "centrifuge", "{sample}_{unit}_{seq_type}.out"), db=expand(opj(config["centrifuge"]["dir"], - "{base}.{i}.cf"), + "{base}.{i}.cf"), i=[1, 2, 3], base=config["centrifuge"]["base"]) output: - opj(config["paths"]["results"], "centrifuge", + opj(config["paths"]["results"], "centrifuge", "{sample}_{unit}_{seq_type}.kreport") params: min_score=config["centrifuge"]["min_score"], @@ -306,7 +306,7 @@ rule metaphlan_se: resources: runtime=lambda wildcards, attempt: attempt**2*60*4 shell: - """ + """ metaphlan {input.se} --bowtie2db {params.dir} --add_viruses --force \ --nproc {threads} --input_type fastq -o {output.tsv} \ --bowtie2out {output.bt2} > {log} 2>&1 @@ -365,7 +365,7 @@ rule plot_metaphlan: "../envs/plotting.yml" notebook: "../notebooks/metaphlan.py.ipynb" - + ##### krona ##### rule krona_taxonomy: diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk index 55e532cb..32ab1110 100644 --- a/workflow/rules/preprocessing.smk +++ b/workflow/rules/preprocessing.smk @@ -131,13 +131,13 @@ rule sortmerna_fastq_pe: "../envs/preprocess.yml" shell: """ - mkdir -p {params.scratch} + mkdir -p {params.scratch} # Run SortMeRNA sortmerna --blast 1 --log -v --fastx --ref {params.ref_string} \ --reads {input.fastq} -a {threads} --{params.paired_strategy} \ --aligned {params.aligned_prefix} --other {params.other_prefix} \ {params.score_params} >{log} 2>&1 - + mv {params.aligned_prefix}.fastq {output.aligned} mv {params.aligned_prefix}.log {log} mv {params.other_prefix}.fastq {output.other} @@ -257,7 +257,7 @@ rule sortmerna_fastq_se: --reads {input.fastq} -a {threads} --other {params.other_prefix} \ --aligned {params.aligned_prefix} {params.score_params} \ >{log} 2>&1 - + mv {params.aligned_prefix}.fastq {output.aligned} mv {params.aligned_prefix}.log {log} mv {params.other_prefix}.fastq {output.other} @@ -360,7 +360,7 @@ rule trimmomatic_pe: 2>{log.R1log} sed \ 's/{wildcards.sample}_{wildcards.unit}_R1/{wildcards.sample}_{wildcards.unit}_R2/g' \ - {log.R1log} > {log.R2log} + {log.R1log} > {log.R2log} """ rule trimmomatic_se: diff --git a/workflow/rules/taxonomy.smk b/workflow/rules/taxonomy.smk index a5af315f..1db9ae81 100644 --- a/workflow/rules/taxonomy.smk +++ b/workflow/rules/taxonomy.smk @@ -125,7 +125,7 @@ rule contigtax_update: else cd {params.dir} ln -s $(basename {input.idmap}) $(basename {output.idmap}) - fi + fi """ rule contigtax_build: @@ -145,7 +145,7 @@ rule contigtax_build: shell: """ contigtax build -d {output} -p {threads} {input.fasta} \ - {input.idmap} {input.nodes} >{log} 2>&1 + {input.idmap} {input.nodes} >{log} 2>&1 """ rule contigtax_search: From 1895a046687d4e44735f03d7ac839ee300941034 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 12:19:56 +0200 Subject: [PATCH 43/63] Formatting fixes after running pre-commit --- workflow/rules/annotation.smk | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk index 1a3c9e3e..67c5bd32 100644 --- a/workflow/rules/annotation.smk +++ b/workflow/rules/annotation.smk @@ -1,6 +1,6 @@ from scripts.common import annotation_input -localrules: +localrules: annotate, download_rfams, press_rfams, @@ -59,7 +59,7 @@ rule trnascan: shell: """ tRNAscan-SE -G -b {output.bed} -o {output.file} -a {output.fasta} \ - --thread {threads} {input} >{log} 2>&1 + --thread {threads} {input} >{log} 2>&1 """ rule download_rfams: @@ -85,11 +85,11 @@ rule download_rfams: # Extract only rfams of interest tar -C {params.dir} -zxf {output.tar} {params.rfams} cat {params.dir}/*.cm > {output.cm} - + # Get release curl -o {output.readme} {params.url}/README 2>/dev/null grep -m 1 Release {output.readme} > {output.version} - + # Get clans curl {params.url}/Rfam.clanin 2>/dev/null | egrep -w \ "CL0011[123]" > {output.clanin} @@ -287,7 +287,7 @@ rule emapper_homology_search: shell: """ mkdir -p {params.tmpdir} - emapper.py {params.flags} --cpu {threads} --temp_dir {params.tmpdir} \ + emapper.py {params.flags} --cpu {threads} --temp_dir {params.tmpdir} \ -i {input[0]} -o {params.out} --output_dir {params.tmpdir} \ --data_dir {params.resource_dir} >{log} 2>&1 mv {params.tmp_out}.emapper.seed_orthologs {output[0]} From e10586a3a88932633c233abe4e1ba2e14921ed35 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 12:22:20 +0200 Subject: [PATCH 44/63] Add pre-commit --- .editorconfig | 1 - .gitattributes | 1 - 2 files changed, 2 deletions(-) diff --git a/.editorconfig b/.editorconfig index baf8ae55..5573fb60 100644 --- a/.editorconfig +++ b/.editorconfig @@ -13,4 +13,3 @@ indent_size = 4 [*.{yml,yaml}] indent_style = space indent_size = 2 - diff --git a/.gitattributes b/.gitattributes index f3011696..74d81cc5 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,2 @@ *.smk linguist-language=Python Snakefile linguist-language=Python - From dc33c7459982f7c244b0a6dc2622025b68a64da9 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 12:25:57 +0200 Subject: [PATCH 45/63] Add pre-commit badge --- README.md | 4 +- workflow/envs/edger.yml | 9 +++ .../envs/{normalize.yml => metagenomeseq.yml} | 6 +- workflow/scripts/common.R | 27 ++++++++ workflow/scripts/edger.R | 36 ++++++++++ workflow/scripts/metagenomeseq.R | 35 ++++++++++ workflow/scripts/normalize.R | 69 ------------------- 7 files changed, 113 insertions(+), 73 deletions(-) create mode 100644 workflow/envs/edger.yml rename workflow/envs/{normalize.yml => metagenomeseq.yml} (60%) create mode 100644 workflow/scripts/common.R create mode 100644 workflow/scripts/edger.R create mode 100644 workflow/scripts/metagenomeseq.R delete mode 100644 workflow/scripts/normalize.R diff --git a/README.md b/README.md index 4ffc68f8..7a13be70 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ A workflow for metagenomic projects [![Snakemake 5.11.2](https://img.shields.io/badge/snakemake-5.11.2-brightgreen.svg)](https://img.shields.io/badge/snakemake-5.11.2) ![CI](https://github.com/NBISweden/nbis-meta/workflows/CI/badge.svg?branch=master) ![Docker](https://img.shields.io/docker/pulls/nbisweden/nbis-meta) - +[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) ## Overview A [snakemake](http://snakemake.readthedocs.io/en/stable/) workflow for @@ -19,7 +19,7 @@ You can use this workflow for _e.g._: - **functional and taxonomic annotation** - **metagenomic binning** -See the [Wiki-pages](https://github.com/NBISweden/nbis-meta/wiki) for +See the [Wiki-pages](https://github.com/NBISweden/nbis-meta/wiki) for instructions on how to run the workflow. ## Installation diff --git a/workflow/envs/edger.yml b/workflow/envs/edger.yml new file mode 100644 index 00000000..ec95a87f --- /dev/null +++ b/workflow/envs/edger.yml @@ -0,0 +1,9 @@ +name: edger +channels: + - conda-forge + - bioconda + - r + - defaults +dependencies: + - r-base + - bioconductor-edger=3.30.0 diff --git a/workflow/envs/normalize.yml b/workflow/envs/metagenomeseq.yml similarity index 60% rename from workflow/envs/normalize.yml rename to workflow/envs/metagenomeseq.yml index f60900c0..559d88d8 100644 --- a/workflow/envs/normalize.yml +++ b/workflow/envs/metagenomeseq.yml @@ -1,7 +1,9 @@ -name: normalize +name: metagenomeseq channels: + - conda-forge - bioconda + - r - defaults dependencies: - - bioconductor-edger + - r-base - bioconductor-metagenomeseq diff --git a/workflow/scripts/common.R b/workflow/scripts/common.R new file mode 100644 index 00000000..16dd914d --- /dev/null +++ b/workflow/scripts/common.R @@ -0,0 +1,27 @@ + +num_cols <- function(x) { + x <- x[, unlist(lapply(x, is.numeric))] + x +} + +str_cols <- function(x) { + x <- x[, unlist(lapply(x, is.character))] + x +} + +process_data <- function(input, output) { + # Read the counts + x <- read.delim(input, row.names = 1, sep = "\t", header = TRUE) + # Remove unclassified + if ("Unclassified" %in% row.names(x)){ + x <- x[row.names(x)!="Unclassified", ] + } + if (nrow(x)==0) { + write.table(x, file = output, quote = FALSE, sep="\t") + quit() + } + + # Extract only numeric columns + x_num <- num_cols(x) + x_num +} diff --git a/workflow/scripts/edger.R b/workflow/scripts/edger.R new file mode 100644 index 00000000..ae5d3ed6 --- /dev/null +++ b/workflow/scripts/edger.R @@ -0,0 +1,36 @@ +#!/usr/bin/env Rscript + +library(edgeR) +source("common.R") + +log <- file(snakemake@log[[1]], open="wt") +sink(log) +sink(log, type="message") + +method <- snakemake@params$method +input <- snakemake@input[[1]] +output <- snakemake@output[[1]] + +x_num <- process_data(input, output) + +if (method %in% c("TMM", "RLE")) { + # Create DGE + obj <- DGEList(x_num) + # Calculate norm factors + obj <- calcNormFactors(obj, method = method) + # Calculate cpms + norm <- cpm(obj, normalized.lib.sizes = TRUE) +} else if (method == "RPKM") { + # Extract gene length column + gene_length <- x_num$Length + x_num <- x_num[, colnames(x_num) != "Length"] + obj <- DGEList(x_num) + # Calculate norm factors + obj <- calcNormFactors(obj, method = "TMM") + # Calculate RPKM + norm <- rpkm(obj, normalized.lib.sizes = TRUE, gene.length = gene_length) +} + +norm <- cbind(str_cols(x), norm) + +write.table(x = norm, file = output, quote = FALSE, sep="\t") diff --git a/workflow/scripts/metagenomeseq.R b/workflow/scripts/metagenomeseq.R new file mode 100644 index 00000000..3fbf4732 --- /dev/null +++ b/workflow/scripts/metagenomeseq.R @@ -0,0 +1,35 @@ +#!/usr/bin/env Rscript + +library(metagenomeSeq) +source("common.R") + +log <- file(snakemake@log[[1]], open="wt") +sink(log) +sink(log, type="message") + +log <- file(snakemake@log[[1]], open="wt") +sink(log) +sink(log, type="message") + +method <- snakemake@params$method +input <- snakemake@input[[1]] +output <- snakemake@output[[1]] + +x_num <- process_data(input, output) + +obj <- newMRexperiment(x_num) +smat = lapply(1:ncol(x_num), function(i) { + sort(x_num[which(x_num[, i]>0),i], decreasing = TRUE) +}) +if(any(sapply(smat,length)==1)) { + fh <-file(snakemake@output[[1]]) + writeLines(c("WARNING: Sample with one or zero features", + "Cumulative Sum Scaling failed for sample"), fh) + close(fh) + quit() +} +norm <- MRcounts(obj, norm = TRUE) + +norm <- cbind(str_cols(x), norm) + +write.table(x = norm, file = output, quote = FALSE, sep="\t") diff --git a/workflow/scripts/normalize.R b/workflow/scripts/normalize.R deleted file mode 100644 index 66bdb489..00000000 --- a/workflow/scripts/normalize.R +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env Rscript - -log <- file(snakemake@log[[1]], open="wt") -sink(log) -sink(log, type="message") - -num_cols <- function(x) { - x <- x[, unlist(lapply(x, is.numeric))] - x -} - -str_cols <- function(x) { - x <- x[, unlist(lapply(x, is.character))] - x -} - -method <- snakemake@params$method - -# Read the counts -x <- read.delim(snakemake@input[[1]], row.names = 1, sep = "\t", header = TRUE) -# Remove unclassified -if ("Unclassified" %in% row.names(x)){ - x <- x[row.names(x)!="Unclassified", ] -} -if (nrow(x)==0) { - write.table(x, file = snakemake@output[[1]], quote = FALSE, sep="\t") - quit() -} - -# Extract only numeric columns -x_num <- num_cols(x) - -if (method %in% c("TMM", "RLE")) { - library(edgeR) - # Create DGE - obj <- DGEList(x_num) - # Calculate norm factors - obj <- calcNormFactors(obj, method = method) - # Calculate cpms - norm <- cpm(obj, normalized.lib.sizes = TRUE) -} else if (method == "RPKM") { - library(edgeR) - # Extract gene length column - gene_length <- x_num$Length - x_num <- x_num[, colnames(x_num) != "Length"] - obj <- DGEList(x_num) - # Calculate norm factors - obj <- calcNormFactors(obj, method = "TMM") - # Calculate RPKM - norm <- rpkm(obj, normalized.lib.sizes = TRUE, gene.length = gene_length) -} else { - library(metagenomeSeq) - obj <- newMRexperiment(x_num) - smat = lapply(1:ncol(x_num), function(i) { - sort(x_num[which(x_num[, i]>0),i], decreasing = TRUE) - }) - if(any(sapply(smat,length)==1)) { - fh <-file(snakemake@output[[1]]) - writeLines(c("WARNING: Sample with one or zero features", - "Cumulative Sum Scaling failed for sample"), fh) - close(fh) - quit() - } - norm <- MRcounts(obj, norm = TRUE) -} - -norm <- cbind(str_cols(x), norm) - -write.table(x = norm, file = snakemake@output[[1]], quote = FALSE, sep="\t") From 557419a60272982eee50c001e577d5e44fa7070a Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 12:26:37 +0200 Subject: [PATCH 46/63] Fix whitespace --- workflow/scripts/eggnog-parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/eggnog-parser.py b/workflow/scripts/eggnog-parser.py index 292ebbca..1875593f 100644 --- a/workflow/scripts/eggnog-parser.py +++ b/workflow/scripts/eggnog-parser.py @@ -335,7 +335,7 @@ def main(): help="emapper.py annotation files (Typically *.emapper.annotations)") annot_parser.add_argument("outdir", help="Output directory for parsed files") - annot_parser.add_argument("--map_go", action="store_true", + annot_parser.add_argument("--map_go", action="store_true", help="Also map GO terms (can take a long time)") annot_parser.set_defaults(func=parse) # Sum annotations parser From 3ca01b78a12820594a1df53ca7fceec9ed30d0c2 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 12:28:26 +0200 Subject: [PATCH 47/63] Merge RGI testing with metaspades --- .github/workflows/main.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c97a0034..55089cfd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -50,12 +50,9 @@ jobs: run: | snakemake --use-conda -j 4 --configfile .test/config/kraken.yaml -p results/kraken/sample1_1_pe.kreport results/kraken/sample4_1_se.kreport - name: Metaspades + RGI - run: | - snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p assemble - rm -r results/assembly results/report examples/data/sample* - - name: RGI run: | snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p annotate + rm -r results/assembly results/report examples/data/sample* - name: Prepare taxonomy run: bash .test/scripts/prep_taxonomy.sh - name: Taxonomy From d7586660b2cf45d2a3bdb22129fb2b2e4573fc8f Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 12:28:49 +0200 Subject: [PATCH 48/63] Fix whitespace --- .test/README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.test/README.md b/.test/README.md index 862d0942..01ce1483 100644 --- a/.test/README.md +++ b/.test/README.md @@ -1,24 +1,23 @@ # Testing -This directory contains config files, resources and scripts used to test the +This directory contains config files, resources and scripts used to test the workflow. **config/** -Files under `config/` are used by the different test steps of the github +Files under `config/` are used by the different test steps of the github actions testing. **data/** The fasta file at `data/uniref100.fasta` was created by: -1. searching [uniprot](https://uniprot.org) for the 5 taxids used to generate +1. searching [uniprot](https://uniprot.org) for the 5 taxids used to generate the [synthetic metagenome](https://zenodo.org/record/3737112#.XsUQncZ8LOQ) that this workflow uses for testing. This resulted in 8,122 identified sequences. 2. mapping the sequences to their UniRef100 id via the Retrieve/ID mapping tool at uniprot, followed by downloading of the reference sequences. 3. subsampling 100 sequences from the downloaded fastafile using `seqtk -During testing the fasta file is used to build and query a diamond database -using `tango`. - +During testing the fasta file is used to build and query a diamond database +using `tango`. From 017ce3b1afce8c4d33311bde3bba238735387511 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 12:29:54 +0200 Subject: [PATCH 49/63] Fix whitespace --- slurm/slurm_utils.py | 4 ++-- workflow/.cfg/multiqc_preprocess_config.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/slurm/slurm_utils.py b/slurm/slurm_utils.py index a420d1c3..a8b5e47b 100644 --- a/slurm/slurm_utils.py +++ b/slurm/slurm_utils.py @@ -59,7 +59,7 @@ def format(_pattern, _quote_all=False, **kwargs): # adapted from Job.format_wildcards in snakemake.jobs def format_wildcards(string, job_properties): """ Format a string with variables from the job. """ - + class Job(object): def __init__(self, job_properties): for key in job_properties: @@ -100,7 +100,7 @@ def format_values(dictionary, job_properties): ) raise WorkflowError(msg, e) return formatted - + def convert_job_properties(job_properties, resource_mapping={}): options = {} resources = job_properties.get("resources", {}) diff --git a/workflow/.cfg/multiqc_preprocess_config.yaml b/workflow/.cfg/multiqc_preprocess_config.yaml index c1bdb4d9..3deab64b 100644 --- a/workflow/.cfg/multiqc_preprocess_config.yaml +++ b/workflow/.cfg/multiqc_preprocess_config.yaml @@ -9,4 +9,4 @@ extra_fn_clean_exts: - '_PHIX' - '.sortmerna' - '_R1' - - '_R2' \ No newline at end of file + - '_R2' From 77a932a190c1e49a739da3587767d5197f152f2f Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 12:37:29 +0200 Subject: [PATCH 50/63] Update path to common.R --- workflow/scripts/edger.R | 5 ++--- workflow/scripts/metagenomeseq.R | 8 ++------ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/workflow/scripts/edger.R b/workflow/scripts/edger.R index ae5d3ed6..5b09a870 100644 --- a/workflow/scripts/edger.R +++ b/workflow/scripts/edger.R @@ -1,12 +1,11 @@ #!/usr/bin/env Rscript -library(edgeR) -source("common.R") - log <- file(snakemake@log[[1]], open="wt") sink(log) sink(log, type="message") +source("workflow/scripts/common.R") +library(edgeR) method <- snakemake@params$method input <- snakemake@input[[1]] output <- snakemake@output[[1]] diff --git a/workflow/scripts/metagenomeseq.R b/workflow/scripts/metagenomeseq.R index 3fbf4732..fbcfe955 100644 --- a/workflow/scripts/metagenomeseq.R +++ b/workflow/scripts/metagenomeseq.R @@ -1,15 +1,11 @@ #!/usr/bin/env Rscript -library(metagenomeSeq) -source("common.R") - log <- file(snakemake@log[[1]], open="wt") sink(log) sink(log, type="message") -log <- file(snakemake@log[[1]], open="wt") -sink(log) -sink(log, type="message") +library(metagenomeSeq) +source("workflow/scripts/common.R") method <- snakemake@params$method input <- snakemake@input[[1]] From 7998fc597deb543021feaaf50c226a30d5904e15 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 12:54:53 +0200 Subject: [PATCH 51/63] Split metagenomeseq and edger into separate scripts to help with conda installation --- workflow/rules/common.smk | 2 +- workflow/rules/quantification.smk | 30 +++++++++++++++++++++++------- workflow/scripts/common.R | 9 +-------- workflow/scripts/edger.R | 9 ++++++++- workflow/scripts/metagenomeseq.R | 8 +++++++- 5 files changed, 40 insertions(+), 18 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 5ec04ede..7012e079 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -37,7 +37,7 @@ wildcard_constraints: group="\w+", l="\d+", counts_type="(counts|rpkm)", - norm_method="(TMM|CSS|RLE)" + norm_method="(TMM|RLE)" from scripts.common import check_uppmax, check_annotation, check_assembly, check_classifiers diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index 95bc40f8..aa656363 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -5,7 +5,8 @@ localrules: aggregate_featurecount, rpkm, count_features, - normalize_features, + edger_normalize_features, + css_normalize_features, sum_to_taxa ##### quantify master rule ##### @@ -137,9 +138,9 @@ rule rpkm: params: method = "RPKM" conda: - "../envs/normalize.yml" + "../envs/edger.yml" script: - "../scripts/normalize.R" + "../scripts/edger.R" rule count_features: """ @@ -153,9 +154,9 @@ rule count_features: script: "../scripts/quantification_utils.py" -rule normalize_features: +rule edger_normalize_features: """ - Normalizes counts of features using TMM, REL or CSS + Normalizes counts of features using TMM and REL """ input: opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.counts.tsv") @@ -166,9 +167,24 @@ rule normalize_features: params: method = "{norm_method}" conda: - "../envs/normalize.yml" + "../envs/edger.yml" script: - "../scripts/normalize.R" + "../scripts/edger.R" + +rule css_normalize_features: + """ + Normalizes counts of features using CSS from metagenomeSeq + """ + input: + opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.counts.tsv") + output: + opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.CSS.tsv") + log: + opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.CSS.log") + conda: + "../envs/metagenomeseq.yml" + script: + "../scripts/metagenomeseq.R" rule sum_to_taxa: """ diff --git a/workflow/scripts/common.R b/workflow/scripts/common.R index 16dd914d..a08c3698 100644 --- a/workflow/scripts/common.R +++ b/workflow/scripts/common.R @@ -9,18 +9,11 @@ str_cols <- function(x) { x } -process_data <- function(input, output) { - # Read the counts - x <- read.delim(input, row.names = 1, sep = "\t", header = TRUE) - # Remove unclassified - if ("Unclassified" %in% row.names(x)){ - x <- x[row.names(x)!="Unclassified", ] - } +process_data <- function(x, output) { if (nrow(x)==0) { write.table(x, file = output, quote = FALSE, sep="\t") quit() } - # Extract only numeric columns x_num <- num_cols(x) x_num diff --git a/workflow/scripts/edger.R b/workflow/scripts/edger.R index 5b09a870..5c9ac1df 100644 --- a/workflow/scripts/edger.R +++ b/workflow/scripts/edger.R @@ -10,7 +10,14 @@ method <- snakemake@params$method input <- snakemake@input[[1]] output <- snakemake@output[[1]] -x_num <- process_data(input, output) +# Read the counts +x <- read.delim(input, row.names = 1, sep = "\t", header = TRUE) +# Remove unclassified +if ("Unclassified" %in% row.names(x)){ + x <- x[row.names(x)!="Unclassified", ] +} + +x_num <- process_data(x, output) if (method %in% c("TMM", "RLE")) { # Create DGE diff --git a/workflow/scripts/metagenomeseq.R b/workflow/scripts/metagenomeseq.R index fbcfe955..1fab5f04 100644 --- a/workflow/scripts/metagenomeseq.R +++ b/workflow/scripts/metagenomeseq.R @@ -10,8 +10,14 @@ source("workflow/scripts/common.R") method <- snakemake@params$method input <- snakemake@input[[1]] output <- snakemake@output[[1]] +# Read the counts +x <- read.delim(input, row.names = 1, sep = "\t", header = TRUE) +# Remove unclassified +if ("Unclassified" %in% row.names(x)){ + x <- x[row.names(x)!="Unclassified", ] +} -x_num <- process_data(input, output) +x_num <- process_data(x, output) obj <- newMRexperiment(x_num) smat = lapply(1:ncol(x_num), function(i) { From 2777be7a33f590fd596c46e2ab7c0585cb6227c2 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 14:22:58 +0200 Subject: [PATCH 52/63] Remove trailing '*' in protein fasta --- workflow/rules/annotation.smk | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk index 67c5bd32..c8f1b9f6 100644 --- a/workflow/rules/annotation.smk +++ b/workflow/rules/annotation.smk @@ -411,7 +411,9 @@ rule rgi: opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.log") params: out=opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.out"), - settings="-a diamond --local --clean --input_type protein" + settings="-a diamond --local --clean --input_type protein", + faa=opj(config["paths"]["temp"], "{assembly}.rgi", "final_contig.faa"), + tmpdir=opj(config["paths"]["temp"], "{assembly}.rgi") shadow: "minimal" conda: "../envs/rgi.yml" @@ -420,9 +422,12 @@ rule rgi: runtime=lambda wildcards, attempt: attempt**2*60 shell: """ + mkdir -p {params.tmpdir} rgi load -i {input.db} --local > {log} 2>&1 - rgi main -i {input.faa} -o {params.out} \ + sed 's/*//g' {input.faa} > {params.faa} + rgi main -i {params.faa} -o {params.out} \ -n {threads} {params.settings} >>{log} 2>>{log} + rm -r {params.tmpdir} """ rule parse_rgi: From 9095a828b02e4b076cc74176c393499eb72295ed Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 14:59:44 +0200 Subject: [PATCH 53/63] Add testing for eggnog-mapper as well --- .github/workflows/main.yml | 3 ++- .test/config/metaspades.yaml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 55089cfd..6c8dfd4c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -49,8 +49,9 @@ jobs: - name: Kraken run: | snakemake --use-conda -j 4 --configfile .test/config/kraken.yaml -p results/kraken/sample1_1_pe.kreport results/kraken/sample4_1_se.kreport - - name: Metaspades + RGI + - name: Metaspades + annotation run: | + bash .test/scripts/prep_eggnog.sh snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p annotate rm -r results/assembly results/report examples/data/sample* - name: Prepare taxonomy diff --git a/.test/config/metaspades.yaml b/.test/config/metaspades.yaml index e619bec8..f93d9174 100644 --- a/.test/config/metaspades.yaml +++ b/.test/config/metaspades.yaml @@ -119,7 +119,7 @@ annotation: # run infernal for rRNA identification? infernal: False # run eggnog-mapper to infer KEGG orthologs, pathways and modules? - eggnog: False + eggnog: True # run PFAM-scan to infer protein families from PFAM? pfam: False # run Resistance gene identifier? From a3fb03e148b8fb61e1a8b367aee7e42ae3b87d63 Mon Sep 17 00:00:00 2001 From: johnne Date: Tue, 13 Oct 2020 15:00:22 +0200 Subject: [PATCH 54/63] Add script for prepping test db for eggnog-mapper --- .test/scripts/prep_eggnog.sh | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100755 .test/scripts/prep_eggnog.sh diff --git a/.test/scripts/prep_eggnog.sh b/.test/scripts/prep_eggnog.sh new file mode 100755 index 00000000..6ffac7a5 --- /dev/null +++ b/.test/scripts/prep_eggnog.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +base="https://github.com/eggnogdb/eggnog-mapper/blob/master/tests/fixtures" + +mkdir -p resources/eggnog-mapper +curl -L -o resources/eggnog-mapper/eggnog.db "${base}/eggnog.db?raw=true" +curl -L -o resources/eggnog-mapper/eggnog_proteins.dmnd "${base}/eggnog_proteins.dmnd?raw=true" +touch resources/eggnog-mapper/eggnog.version From fefc773ba00d35f2f552527e6384cb4345c9b241 Mon Sep 17 00:00:00 2001 From: johnne Date: Wed, 14 Oct 2020 10:59:21 +0200 Subject: [PATCH 55/63] Add TMPDIR to env for Github Actions --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6c8dfd4c..a68c2990 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,6 +13,8 @@ on: - 'LICENSE' branches: - master +env: + TMPDIR: /tmp jobs: test: From bb2f532e521a157b73afae12ef9e85b97a3a2cbe Mon Sep 17 00:00:00 2001 From: johnne Date: Wed, 14 Oct 2020 11:29:56 +0200 Subject: [PATCH 56/63] Set phiX filtering to false for testing --- .test/config/metaspades.yaml | 2 +- .test/config/preprocess.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.test/config/metaspades.yaml b/.test/config/metaspades.yaml index f93d9174..aa5444ad 100644 --- a/.test/config/metaspades.yaml +++ b/.test/config/metaspades.yaml @@ -20,7 +20,7 @@ preprocessing: # run fastuniq (removes duplicates from paired-end samples) fastuniq: True # map reads agains the phix genome and keep only reads that do not map concordantly - phix_filter: True + phix_filter: False # run SortMeRNA to identify (and filter) rRNA sequences sortmerna: True diff --git a/.test/config/preprocess.yaml b/.test/config/preprocess.yaml index a2a2e6b7..4d56d3d9 100644 --- a/.test/config/preprocess.yaml +++ b/.test/config/preprocess.yaml @@ -20,7 +20,7 @@ preprocessing: # run fastuniq (removes duplicates from paired-end samples) fastuniq: True # map reads agains the phix genome and keep only reads that do not map concordantly - phix_filter: True + phix_filter: False # run SortMeRNA to identify (and filter) rRNA sequences sortmerna: True From f2315c9efc04e0b17264429c25b15daa7932e5a0 Mon Sep 17 00:00:00 2001 From: johnne Date: Wed, 14 Oct 2020 14:12:49 +0200 Subject: [PATCH 57/63] Remove message from sample linking --- workflow/rules/preprocessing.smk | 1 - 1 file changed, 1 deletion(-) diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk index 32ab1110..b06645e6 100644 --- a/workflow/rules/preprocessing.smk +++ b/workflow/rules/preprocessing.smk @@ -27,7 +27,6 @@ rule link_files: lambda wildcards: samples[wildcards.sample][wildcards.unit][wildcards.pair] output: opj(config["paths"]["results"], "intermediate", "preprocess", "{sample}_{unit}_{pair}.fastq.gz") - message: "Linking {wildcards.sample}_{wildcards.unit}_{wildcards.pair}.fastq.gz" run: link(input[0], output[0]) From 721f7375e257456916e3f19e0b6a5ffc51148a23 Mon Sep 17 00:00:00 2001 From: johnne Date: Wed, 14 Oct 2020 16:44:10 +0200 Subject: [PATCH 58/63] Update config files for testing --- .github/workflows/main.yml | 25 +++++++++++++++++-------- .test/config/metabat.yaml | 2 +- .test/config/preprocess.yaml | 2 +- .test/config/taxonomy.yaml | 10 +++++----- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a68c2990..9dea8b2c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -27,45 +27,50 @@ jobs: - uses: s-weigand/setup-conda@v1 - name: Install the conda environment run: conda env update -n base -f environment.yml + # First dry-run on full workflow - name: Dry run run: snakemake --use-conda -j 4 --configfile .test/config/dry-run.yaml -n + # Test cutadapt (10k reads/sample) - name: Cutadapt run: | snakemake --use-conda -j 4 --configfile .test/config/cutadapt.yaml -p --notemp qc snakemake -j 1 --configfile .test/config/cutadapt.yaml --report report.html qc + # Upload sample report - name: Upload cutadapt report uses: actions/upload-artifact@v1 with: name: ${{ runner.os }}-cutadapt.html path: report.html + # Test all other preprocessing software except cutadapt (10k reads/sample) - name: Preprocess run: | rm -rf results report.html snakemake --use-conda -j 4 --configfile .test/config/preprocess.yaml -p qc snakemake --use-conda -j 4 --configfile .test/config/preprocess.yaml -p --report report.html qc + # Upload samples report - name: Upload samples report uses: actions/upload-artifact@v1 with: name: ${{ runner.os }}-samples_report.html path: report.html + # Run kraken for sample1 (paired-end) and sample4 (single-end) (10k reads/sample) - name: Kraken run: | snakemake --use-conda -j 4 --configfile .test/config/kraken.yaml -p results/kraken/sample1_1_pe.kreport results/kraken/sample4_1_se.kreport - - name: Metaspades + annotation + rm -rf results examples/data/sample* + # Test annotations + normalizations with metaspades (100k reads/sample) + - name: Annotation run: | bash .test/scripts/prep_eggnog.sh - snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p annotate + bash .test/scripts/prep_taxonomy.sh + snakemake --use-conda -j 4 --configfile .test/config/annotate.yaml -p annotate rm -r results/assembly results/report examples/data/sample* - - name: Prepare taxonomy - run: bash .test/scripts/prep_taxonomy.sh - - name: Taxonomy - run: | - snakemake --use-conda -j 4 --configfile .test/config/taxonomy.yaml -p taxonomy - rm -r results examples/data/sample* + # Test Metabat2 with Megahit (200k reads/sample) - name: Metabat run: | snakemake --use-conda -j 4 --configfile .test/config/metabat.yaml -p bin assemble snakemake --use-conda -j 4 --configfile .test/config/metabat.yaml -p --report report.html bin assemble + # Upload report - name: Upload snakemake report uses: actions/upload-artifact@v1 with: @@ -89,16 +94,20 @@ jobs: restore-keys: | ${{ runner.os }}-${{ env.cache-name }} ${{ runner.os }}- + # Run kraken and produce final output reports (linux only) (10k reads/sample) - name: Kraken run: | snakemake --use-conda -j 4 --configfile .test/config/kraken.yaml --notemp -p classify rm -r examples/data/sample* results + # Run binning including also checkm (200k reads/sample) - name: Binning run: | snakemake --use-conda -j 4 --configfile .test/config/binning.yaml -p assemble bin + # Create report for assembly and binning - name: Snakemake report run: | snakemake --use-conda -j 4 --configfile .test/config/binning.yaml -p --report report.html assemble bin + # Upload report - name: Upload snakemake report uses: actions/upload-artifact@v1 with: diff --git a/.test/config/metabat.yaml b/.test/config/metabat.yaml index 03f0e3d8..14196c45 100644 --- a/.test/config/metabat.yaml +++ b/.test/config/metabat.yaml @@ -16,7 +16,7 @@ preprocessing: # trim reads with trimmomatic? (quality and adapter trimming) trimmomatic: False # trim reads with cutadapt? (runs instead of trimmomatic, no quality trimming) - cutadapt: True + cutadapt: False # run fastuniq (removes duplicates from paired-end samples) fastuniq: False # map reads agains the phix genome and keep only reads that do not map concordantly diff --git a/.test/config/preprocess.yaml b/.test/config/preprocess.yaml index 4d56d3d9..a2a2e6b7 100644 --- a/.test/config/preprocess.yaml +++ b/.test/config/preprocess.yaml @@ -20,7 +20,7 @@ preprocessing: # run fastuniq (removes duplicates from paired-end samples) fastuniq: True # map reads agains the phix genome and keep only reads that do not map concordantly - phix_filter: False + phix_filter: True # run SortMeRNA to identify (and filter) rRNA sequences sortmerna: True diff --git a/.test/config/taxonomy.yaml b/.test/config/taxonomy.yaml index 8f6562cd..2f8942b1 100644 --- a/.test/config/taxonomy.yaml +++ b/.test/config/taxonomy.yaml @@ -12,9 +12,9 @@ paths: preprocessing: # run fastqc on (preprocessed) input? # if no preprocessing is done, fastqc will be run on the raw reads - fastqc: True + fastqc: False # trim reads with trimmomatic? (quality and adapter trimming) - trimmomatic: True + trimmomatic: False # trim reads with cutadapt? (runs instead of trimmomatic, no quality trimming) cutadapt: False # run fastuniq (removes duplicates from paired-end samples) @@ -87,13 +87,13 @@ sortmerna: extra_settings: "--num_alignments 1" # remove duplicates from bam files? -remove_duplicates: True +remove_duplicates: False assembly: # run Megahit assembler? - megahit: False + megahit: True # Use Metaspades instead of Megahit for assembly? - metaspades: True + metaspades: False megahit: # maximum threads for megahit From 806f6a677b2c62284ed183c36b372f3cae33eb56 Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 15 Oct 2020 09:57:39 +0200 Subject: [PATCH 59/63] Rename test config --- .test/config/{metaspades.yaml => annotate.yaml} | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) rename .test/config/{metaspades.yaml => annotate.yaml} (98%) diff --git a/.test/config/metaspades.yaml b/.test/config/annotate.yaml similarity index 98% rename from .test/config/metaspades.yaml rename to .test/config/annotate.yaml index aa5444ad..45643473 100644 --- a/.test/config/metaspades.yaml +++ b/.test/config/annotate.yaml @@ -14,15 +14,15 @@ preprocessing: # if no preprocessing is done, fastqc will be run on the raw reads fastqc: False # trim reads with trimmomatic? (quality and adapter trimming) - trimmomatic: True + trimmomatic: False # trim reads with cutadapt? (runs instead of trimmomatic, no quality trimming) cutadapt: False # run fastuniq (removes duplicates from paired-end samples) - fastuniq: True + fastuniq: False # map reads agains the phix genome and keep only reads that do not map concordantly phix_filter: False # run SortMeRNA to identify (and filter) rRNA sequences - sortmerna: True + sortmerna: False # parameters for trimmomatic trimmomatic: @@ -87,11 +87,11 @@ sortmerna: extra_settings: "--num_alignments 1" # remove duplicates from bam files? -remove_duplicates: True +remove_duplicates: False assembly: # run Megahit assembler? - megahit: True + megahit: False # Use Metaspades instead of Megahit for assembly? metaspades: True @@ -125,7 +125,7 @@ annotation: # run Resistance gene identifier? rgi: True # run taxonomic annotation of assembled contigs (using tango + sourmash)? - taxonomy: False + taxonomy: True # params for taxonomic annotation of contigs/orfs taxonomy: @@ -262,4 +262,4 @@ metaphlan: # this sets the number of reads to generate for example input files # the files are generated based on the config/samples.tsv file and stored # under examples/data -example_dataset_size: 10000 +example_dataset_size: 100000 From bd85465beb9b5849549fb366e9656d605c9c4332 Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 15 Oct 2020 10:54:51 +0200 Subject: [PATCH 60/63] Minor update to github workflow --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 9dea8b2c..4780c750 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -64,7 +64,7 @@ jobs: bash .test/scripts/prep_eggnog.sh bash .test/scripts/prep_taxonomy.sh snakemake --use-conda -j 4 --configfile .test/config/annotate.yaml -p annotate - rm -r results/assembly results/report examples/data/sample* + rm -r results/assembly examples/data/sample* # Test Metabat2 with Megahit (200k reads/sample) - name: Metabat run: | From d9ba7cf5c14db0a02ecf0c3253e28570abd02cee Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 15 Oct 2020 12:16:47 +0200 Subject: [PATCH 61/63] Set markduplicates to True for annotation testing --- .test/config/annotate.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config/annotate.yaml b/.test/config/annotate.yaml index 45643473..0418d8b3 100644 --- a/.test/config/annotate.yaml +++ b/.test/config/annotate.yaml @@ -87,7 +87,7 @@ sortmerna: extra_settings: "--num_alignments 1" # remove duplicates from bam files? -remove_duplicates: False +remove_duplicates: True assembly: # run Megahit assembler? From b29407490c836fae6ba4bb9d0205c0bf4abc3c57 Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 15 Oct 2020 12:30:19 +0200 Subject: [PATCH 62/63] Run annotation with 2 cores --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4780c750..ab50bb07 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -63,7 +63,7 @@ jobs: run: | bash .test/scripts/prep_eggnog.sh bash .test/scripts/prep_taxonomy.sh - snakemake --use-conda -j 4 --configfile .test/config/annotate.yaml -p annotate + snakemake --use-conda -j 2 --configfile .test/config/annotate.yaml -p annotate rm -r results/assembly examples/data/sample* # Test Metabat2 with Megahit (200k reads/sample) - name: Metabat From eabf34fdffd2e741d0a10cf2f72b1c5b1e60e601 Mon Sep 17 00:00:00 2001 From: johnne Date: Thu, 15 Oct 2020 12:30:40 +0200 Subject: [PATCH 63/63] Don't run markduplicates for metabat testing --- .test/config/metabat.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config/metabat.yaml b/.test/config/metabat.yaml index 14196c45..ecc8c616 100644 --- a/.test/config/metabat.yaml +++ b/.test/config/metabat.yaml @@ -87,7 +87,7 @@ sortmerna: extra_settings: "--num_alignments 1" # remove duplicates from bam files? -remove_duplicates: True +remove_duplicates: False assembly: # run Megahit assembler?