From 686daa7e4655bdcd38386f7143c60cbd7d3c1589 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 28 Sep 2020 00:03:30 +0200
Subject: [PATCH 01/63] Fix KeyError on adding new units

---
 workflow/scripts/common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py
index a44bc791..c3754dc2 100644
--- a/workflow/scripts/common.py
+++ b/workflow/scripts/common.py
@@ -110,6 +110,8 @@ def parse_samples(df, config, PREPROCESS):
             for a in assem_list:
                 if sample not in assemblies[a].keys():
                     assemblies[a][sample] = {unit: {}}
+                if unit not in assemblies[a][sample].keys():
+                    assemblies[a][sample][unit] = {}
                 if r2:
                     assemblies[a][sample][unit]["R1"] = [R1_p]
                     assemblies[a][sample][unit]["R2"] = [R2_p]

From ccde58a9ac51946a0350009b0bae55ca8885fa2c Mon Sep 17 00:00:00 2001
From: John Sundh <john.sundh@scilifelab.se>
Date: Mon, 28 Sep 2020 00:20:36 +0200
Subject: [PATCH 02/63] Fix relative path to eggnog-parser.py

---
 workflow/rules/annotation.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index 95879870..c51dd2f5 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -255,7 +255,7 @@ rule get_kegg_info:
         opj("resources", "kegg", "download.log")
     params:
         outdir=lambda w, output: os.path.dirname(output[0]),
-        src="../scripts/eggnog-parser.py"
+        src="workflow/scripts/eggnog-parser.py"
     shell:
         """
         python {params.src} download {params.outdir} > {log} 2>&1
@@ -371,7 +371,7 @@ rule parse_ko_annotations:
     params:
         outbase=opj(config["paths"]["results"], "annotation", "{assembly}"),
         resource_dir=opj("resources", "kegg"),
-        src="../scripts/eggnog-parser.py"
+        src="workflow/scripts/eggnog-parser.py"
     shell:
         """
         python {params.src} parse {params.resource_dir} {input.annotations} \

From 938bbe544fe85745418a0e4b4aa7f3613c687bdd Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 28 Sep 2020 08:30:29 +0200
Subject: [PATCH 03/63] Increase runtime for pfam_scan

---
 workflow/rules/annotation.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index c51dd2f5..fd60c200 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -207,7 +207,7 @@ rule pfam_scan:
         tmp_out=opj(config["paths"]["temp"], "{assembly}.pfam.out")
     threads: 2
     resources:
-        runtime=lambda wildcards, attempt: attempt**2*60*4
+        runtime=lambda wildcards, attempt: attempt**2*60*10
     shell:
         """
         pfam_scan.pl -fasta {input[0]} -dir {params.dir} -cpu {threads} \

From c44492de59a7b0bfc189c1208c38d32704b4fc6b Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 28 Sep 2020 14:40:52 +0200
Subject: [PATCH 04/63] Remove utility function for markduplicates

---
 workflow/scripts/common.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py
index c3754dc2..a79b5709 100644
--- a/workflow/scripts/common.py
+++ b/workflow/scripts/common.py
@@ -664,21 +664,6 @@ def parse_cmout(f):
 
 ## Quantification functions
 
-def markdup_mem(wildcards, cores):
-    """
-    Calculates the memory to allocate when running MarkDuplicates
-    :param wildcards:
-    :param cores: number of cores for currently running workflow
-    :return:
-    """
-    if cores is not None:
-        threads = min(cores, 10)
-    else:
-        threads = 1
-    mem_gb_per_thread = 2
-    return int(mem_gb_per_thread * threads)
-
-
 def get_fc_files(wildcards, file_type):
     g = wildcards.group
     files = []

From 0975afb4c5149494f35050ab00503b91830841c2 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 28 Sep 2020 14:41:38 +0200
Subject: [PATCH 05/63] Don't remove symlinks in fastANI rule

---
 workflow/rules/binning.smk | 1 -
 1 file changed, 1 deletion(-)

diff --git a/workflow/rules/binning.smk b/workflow/rules/binning.smk
index e7450e8f..232f1be1 100644
--- a/workflow/rules/binning.smk
+++ b/workflow/rules/binning.smk
@@ -778,7 +778,6 @@ rule fastANI:
         fastANI --rl {input[0]} --ql {input[1]} -k {params.k} -t {threads} \
             --fragLen {params.frag_len} --minFraction {params.fraction} \
             --matrix -o {output[0]} > {log} 2>&1
-        rm {params.indir}/*.fa {params.indir}/*.fna
         """
 
 rule cluster_genomes:

From 3caa6b79bdfb3a33ce844f1751d2ceac32725a7e Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Wed, 30 Sep 2020 14:44:54 +0200
Subject: [PATCH 06/63] Add workaround for MarkDuplicates PG error

picard-tools MarkDuplicates threw an error when parsing the '@PG' key in the sam header. This workaround creates a temporary bam file with the PG key removed before running MarkDuplicates
---
 workflow/rules/quantification.smk | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index 22c741d4..4f64dc5c 100644
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -1,5 +1,3 @@
-from scripts.common import markdup_mem
-
 localrules:
     quantify,
     write_featurefile,
@@ -41,29 +39,37 @@ rule remove_mark_duplicates:
     log:
         opj(config["paths"]["results"], "assembly", "{assembly}", "mapping", "{sample}_{unit}_{seq_type}.markdup.log")
     params:
+        header=opj(config["paths"]["temp"], "{assembly}", "{sample}_{unit}_{seq_type}.header"),
+        rehead_bam=opj(config["paths"]["temp"], "{assembly}", "{sample}_{unit}_{seq_type}.rehead.bam"),
         temp_bam=opj(config["paths"]["temp"], "{assembly}", "{sample}_{unit}_{seq_type}.markdup.bam"),
         temp_sort_bam=opj(config["paths"]["temp"], "{assembly}", "{sample}_{unit}_{seq_type}.markdup.re_sort.bam"),
         temp_dir=opj(config["paths"]["temp"], "{assembly}")
     threads: 10
     resources:
-        runtime=lambda wildcards, attempt: attempt**2*60*4,
-        mem_mb=lambda wildcards: markdup_mem(wildcards, workflow.cores)
+        runtime=lambda wildcards, attempt: attempt**2*60*4
     conda:
         "../envs/quantify.yml"
     shell:
         """
         mkdir -p {params.temp_dir}
-        java -Xms2g -Xmx{resources.mem_mb}g -XX:ParallelGCThreads={threads} \
+        # Fix bam header
+        samtools view -H {input} | egrep -v "^@PG" > {params.header}
+        samtools reheader -P {params.header} {input} > {params.rehead_bam}
+        # Set memory max
+        mem="-Xmx$((6 * {threads}))g"
+        java -Xms2g $mem -XX:ParallelGCThreads={threads} \
             -jar $CONDA_PREFIX/share/picard-*/picard.jar MarkDuplicates \
-            I={input} M={output[2]} O={params.temp_bam} REMOVE_DUPLICATES=TRUE \
-            USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE ASO=coordinate 2> {log}
+            I={params.rehead_bam} M={output[2]} O={params.temp_bam} REMOVE_DUPLICATES=TRUE \
+            USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE ASSUME_SORT_ORDER=coordinate \
+            PROGRAM_RECORD_ID=null ADD_PG_TAG_TO_READS=FALSE 2> {log}
         # Re sort the bam file using samtools
-        samtools sort -o {params.temp_sort_bam} {params.temp_bam}
+        samtools_threads="$(({threads} - 1))"
+        samtools sort -@ $samtools_threads -o {params.temp_sort_bam} {params.temp_bam} > /dev/null 2>&1
         # Index the bam file
         samtools index {params.temp_sort_bam}
         mv {params.temp_sort_bam} {output[0]}
         mv {params.temp_sort_bam}.bai {output[1]}
-        rm {params.temp_bam}
+        rm {params.temp_bam} {params.rehead_bam} {params.header}
         """
 
 ##### featurecounts #####

From 0ec5643838697e5b4498644a704aa60a59162fc5 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 6 Oct 2020 12:04:30 +0200
Subject: [PATCH 07/63] Fix path to eggnog-parser.py script

---
 workflow/rules/quantification.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index 4f64dc5c..5bea583f 100644
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -182,7 +182,7 @@ rule quantify_features:
         opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.{fc_type}.tsv")
     shell:
         """
-        python source/utils/eggnog-parser.py \
+        python workflow/scripts/eggnog-parser.py \
             quantify {input.abund} {input.annot} {output[0]}
         """
 

From 4ec3083096be9cfdbb65b2c7117506919a69ec0e Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Wed, 7 Oct 2020 21:49:59 +0200
Subject: [PATCH 08/63] Add norm_method wildcard constraints

---
 workflow/rules/common.smk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index bf2cd2fb..f8e18459 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -35,7 +35,8 @@ wildcard_constraints:
     seq_type="[sp]e",
     binner="[a-z]+",
     group="\w+",
-    l="\d+"
+    l="\d+",
+    norm_method="(TMM|CSS|REL)"
 
 from scripts.common import check_uppmax, check_annotation, check_assembly, check_classifiers
 

From 7c68a479f072a535c10608c4bc92ddfd8c2f60bf Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Wed, 7 Oct 2020 21:50:23 +0200
Subject: [PATCH 09/63] Add conda env for normalization

---
 workflow/envs/normalize.yml | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 workflow/envs/normalize.yml

diff --git a/workflow/envs/normalize.yml b/workflow/envs/normalize.yml
new file mode 100644
index 00000000..273cdfa9
--- /dev/null
+++ b/workflow/envs/normalize.yml
@@ -0,0 +1,8 @@
+name: normalize
+channels:
+  - r
+  - bioconda
+  - defaults
+dependencies:
+  - bioconductor-edger=3.30.0
+  - bioconductor-metagenomeseq=1.30.0

From 905a8e23b3c81596422ff17a5c6488a4816c9b90 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Wed, 7 Oct 2020 21:50:47 +0200
Subject: [PATCH 10/63] Add script for normalization of counts

---
 workflow/scripts/normalize.R | 42 ++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 workflow/scripts/normalize.R

diff --git a/workflow/scripts/normalize.R b/workflow/scripts/normalize.R
new file mode 100644
index 00000000..00f2624b
--- /dev/null
+++ b/workflow/scripts/normalize.R
@@ -0,0 +1,42 @@
+#!/usr/bin/env Rscript
+
+log <- file(snakemake@log[[1]], open="wt")
+sink(log)
+sink(log, type="message")
+
+num_cols <- function(x) {
+    x <- x[, unlist(lapply(x, is.numeric))]
+    x
+}
+
+str_cols <- function(x) {
+    x <- x[, unlist(lapply(x, is.character))]
+    x
+}
+
+method <- snakemake@params$method
+
+# Read the counts
+x <- read.delim(snakemake@input[[1]], row.names = 1)
+# Remove unclassified
+x <- x[row.names(x)!="Unclassified", ]
+# Extract only numeric columns
+x_num <- num_cols(x)
+
+if (method %in% c("TMM", "RLE")) {
+    library(edgeR, quietly = TRUE)
+    # Create DGE
+    obj <- DGEList(x_num)
+    # Calculate norm factors
+    obj <- calcNormFactors(obj, method = method)
+    # Calculate cpms
+    norm <- cpm(obj, normalized.lib.sizes = TRUE)
+} else {
+    library(metagenomeSeq, quietly = TRUE)
+    obj <- newMRexperiment(x_num)
+    norm <- MRcounts(obj, norm = TRUE)
+}
+
+norm <- cbind(str_cols(x), norm)
+
+write.table(x = norm, file = snakemake@output[[1]], quote = FALSE, sep="\t")

From 3586286e6f4487d3d456472dd2aafd09504f5f06 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Wed, 7 Oct 2020 21:52:15 +0200
Subject: [PATCH 11/63] Initial add of normalize rule

---
 workflow/rules/quantification.smk | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index 5bea583f..3ab4872a 100644
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -186,6 +186,20 @@ rule quantify_features:
             quantify {input.abund} {input.annot} {output[0]}
         """
 
+rule normalize:
+    input:
+        opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.raw.tsv")
+    output:
+        opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.{norm_method}.tsv")
+    log:
+        opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.{norm_method}.log")
+    params:
+        method = "{norm_method}"
+    conda:
+        "../envs/normalize.yml"
+    script:
+        "../scripts/normalize.R"
+
 rule sum_to_taxa:
     input:
         tax=opj(config["paths"]["results"], "annotation", "{assembly}", "taxonomy",

From 644369aa2403e120d6c1cef242eb3b41a999ca44 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 8 Oct 2020 13:00:33 +0200
Subject: [PATCH 12/63] Generalize featurecounts to one rule

---
 workflow/rules/quantification.smk | 45 ++++++++-----------------------
 1 file changed, 11 insertions(+), 34 deletions(-)

diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index 5bea583f..425cc515 100644
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -74,56 +74,33 @@ rule remove_mark_duplicates:
 
 ##### featurecounts #####
 
-rule featurecount_pe:
+rule featurecount:
     input:
         gff=opj(config["paths"]["results"], "annotation", "{assembly}",
                 "final_contigs.features.gff"),
         bam=opj(config["paths"]["results"], "assembly", "{assembly}",
-                "mapping", "{sample}_{unit}_pe"+POSTPROCESS+".bam")
+                "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam")
     output:
         opj(config["paths"]["results"], "assembly", "{assembly}", "mapping",
-            "{sample}_{unit}_pe.fc.tsv"),
+            "{sample}_{unit}_{seq_type}.fc.tsv"),
         opj(config["paths"]["results"], "assembly", "{assembly}", "mapping",
-            "{sample}_{unit}_pe.fc.tsv.summary")
+            "{sample}_{unit}_{seq_type}.fc.tsv.summary")
     log:
         opj(config["paths"]["results"], "assembly", "{assembly}",
-            "mapping", "{sample}_{unit}_pe.fc.log")
+            "mapping", "{sample}_{unit}_{seq_type}.fc.log")
     threads: 4
-    params: tmpdir=config["paths"]["temp"]
-    resources:
-        runtime=lambda wildcards, attempt: attempt**2*30
-    conda:
-        "../envs/quantify.yml"
-    shell:
-        """
-        featureCounts -a {input.gff} -o {output[0]} -t CDS -g gene_id -M -p \
-            -B -T {threads} --tmpDir {params.tmpdir} {input.bam} > {log} 2>&1
-        """
-
-rule featurecount_se:
-    input:
-        gff=opj(config["paths"]["results"], "annotation", "{assembly}",
-                "final_contigs.features.gff"),
-        bam=opj(config["paths"]["results"], "assembly", "{assembly}",
-                "mapping", "{sample}_{unit}_se"+POSTPROCESS+".bam")
-    output:
-        opj(config["paths"]["results"], "assembly", "{assembly}",
-            "mapping", "{sample}_{unit}_se.fc.tsv"),
-        opj(config["paths"]["results"], "assembly", "{assembly}",
-            "mapping", "{sample}_{unit}_se.fc.tsv.summary")
-    log:
-        opj(config["paths"]["results"], "assembly", "{assembly}",
-            "mapping", "{sample}_{unit}_se.fc.log")
-    threads: 4
-    params: tmpdir=config["paths"]["temp"]
+    params:
+        tmpdir=config["paths"]["temp"],
+        setting=lambda wildcards: "-B -p" if wildcards.seq_type == "pe" else ""
     resources:
         runtime=lambda wildcards, attempt: attempt**2*30
     conda:
         "../envs/quantify.yml"
     shell:
         """
-        featureCounts -a {input.gff} -o {output[0]} -t CDS -g gene_id \
-            -M -T {threads} --tmpDir {params.tmpdir} {input.bam} > {log} 2>&1
+        mkdir -p {params.tmpdir}
+        featureCounts -a {input.gff} -o {output[0]} -t CDS -g gene_id -M \
+            {params.setting} -T {threads} --tmpDir {params.tmpdir} {input.bam} > {log} 2>&1
         """
 
 rule samtools_stats:

From 98b1e1fc12a71da16b54c37f067d2768549b7e2c Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 8 Oct 2020 14:31:00 +0200
Subject: [PATCH 13/63] Add RPKM functionality to normalize script

---
 workflow/scripts/normalize.R | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/workflow/scripts/normalize.R b/workflow/scripts/normalize.R
index 00f2624b..f81b636c 100644
--- a/workflow/scripts/normalize.R
+++ b/workflow/scripts/normalize.R
@@ -24,15 +24,25 @@ x <- x[row.names(x)!="Unclassified", ]
 x_num <- num_cols(x)
 
 if (method %in% c("TMM", "RLE")) {
-    library(edgeR, quietly = TRUE)
+    library(edgeR)
     # Create DGE
     obj <- DGEList(x_num)
     # Calculate norm factors
     obj <- calcNormFactors(obj, method = method)
     # Calculate cpms
     norm <- cpm(obj, normalized.lib.sizes = TRUE)
+} else if (method == "RPKM") {
+    library(edgeR)
+    # Extract gene length column
+    gene_length <- x_num$Length
+    x_num <- x_num[, colnames(x_num) != "Length"]
+    obj <- DGEList(x_num)
+    # Calculate norm factors
+    obj <- calcNormFactors(obj, method = "TMM")
+    # Calculate RPKM
+    norm <- rpkm(obj, normalized.lib.sizes = TRUE, gene.length = gene_length)
 } else {
-    library(metagenomeSeq, quietly = TRUE)
+    library(metagenomeSeq)
     obj <- newMRexperiment(x_num)
     norm <- MRcounts(obj, norm = TRUE)
 }

From 25d15f715a553aa36c825833b675c729278ac5d8 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 8 Oct 2020 14:37:36 +0200
Subject: [PATCH 14/63] Add RPKM rule

---
 workflow/rules/quantification.smk | 73 ++++++++++++++++---------------
 1 file changed, 38 insertions(+), 35 deletions(-)

diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index 306e10d2..6485cd5b 100644
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -2,7 +2,6 @@ localrules:
     quantify,
     write_featurefile,
     samtools_stats,
-    normalize_featurecount,
     aggregate_featurecount,
     sum_to_taxa,
     quantify_features,
@@ -72,6 +71,20 @@ rule remove_mark_duplicates:
         rm {params.temp_bam} {params.rehead_bam} {params.header}
         """
 
+rule samtools_stats:
+    input:
+        opj(config["paths"]["results"], "assembly", "{assembly}",
+                "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam")
+    output:
+        opj(config["paths"]["results"], "assembly", "{assembly}",
+                "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam.stats")
+    conda:
+        "../envs/quantify.yml"
+    shell:
+        """
+        samtools stats {input} > {output}
+        """
+
 ##### featurecounts #####
 
 rule featurecount:
@@ -103,54 +116,44 @@ rule featurecount:
             {params.setting} -T {threads} --tmpDir {params.tmpdir} {input.bam} > {log} 2>&1
         """
 
-rule samtools_stats:
-    input:
-        opj(config["paths"]["results"], "assembly", "{assembly}",
-                "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam")
-    output:
-        opj(config["paths"]["results"], "assembly", "{assembly}",
-                "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam.stats")
-    conda:
-        "../envs/quantify.yml"
-    shell:
-        """
-        samtools stats {input} > {output}
-        """
-
-rule normalize_featurecount:
+rule clean_featurecount:
     input:
         opj(config["paths"]["results"], "assembly", "{assembly}", "mapping",
-            "{sample}_{unit}_{seq_type}.fc.tsv"),
-        opj(config["paths"]["results"], "assembly", "{assembly}", "mapping",
-            "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam.stats")
+            "{sample}_{unit}_{seq_type}.fc.tsv")
     output:
         opj(config["paths"]["results"], "assembly", "{assembly}", "mapping",
-            "{sample}_{unit}_{seq_type}.fc.tpm.tsv"),
-        opj(config["paths"]["results"], "assembly", "{assembly}", "mapping",
-            "{sample}_{unit}_{seq_type}.fc.raw.tsv")
-    log:
-        opj(config["paths"]["results"], "assembly", "{assembly}", "mapping",
-            "{sample}_{unit}_{seq_type}.fc.norm.log")
+            "{sample}_{unit}_{seq_type}.fc.clean.tsv")
     script:
         "../scripts/quantification_utils.py"
 
 rule aggregate_featurecount:
-    """Aggregates raw and normalized featureCounts files"""
+    """Aggregates all cleaned count files from featureCounts"""
     input:
-        raw_files=get_all_files(samples, opj(config["paths"]["results"],
+        get_all_files(samples, opj(config["paths"]["results"],
                                              "assembly", "{assembly}", "mapping"),
-                                ".fc.raw.tsv"),
-        tpm_files=get_all_files(samples, opj(config["paths"]["results"],
-                                             "assembly", "{assembly}", "mapping"),
-                                ".fc.tpm.tsv"),
-        gff_file=opj(config["paths"]["results"], "annotation", "{assembly}",
-                     "final_contigs.features.gff")
+                                ".fc.clean.tsv")
     output:
-        raw=opj(config["paths"]["results"], "annotation", "{assembly}", "fc.raw.tsv"),
-        tpm=opj(config["paths"]["results"], "annotation", "{assembly}", "fc.tpm.tsv")
+        opj(config["paths"]["results"], "annotation", "{assembly}", "counts.tsv")
     script:
         "../scripts/quantification_utils.py"
 
+rule rpkm:
+    """
+    Calculate RPKM for genes in an assembly
+    """
+    input:
+        opj(config["paths"]["results"], "annotation", "{assembly}", "counts.tsv")
+    output:
+        opj(config["paths"]["results"], "annotation", "{assembly}", "rpkm.tsv")
+    log:
+        opj(config["paths"]["results"], "annotation", "{assembly}", "rpkm.log")
+    params:
+        method = "RPKM"
+    conda:
+        "../envs/normalize.yml"
+    script:
+        "../scripts/normalize.R"
+
 rule quantify_features:
     input:
         abund=opj(config["paths"]["results"], "annotation", "{assembly}", "fc.{fc_type}.tsv"),

From 9ed0db71f6ceae3dd523956490cb68f5550eac90 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 8 Oct 2020 14:38:40 +0200
Subject: [PATCH 15/63] Remove samtools stats

---
 workflow/rules/quantification.smk | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index 6485cd5b..4afa0a67 100644
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -1,8 +1,8 @@
 localrules:
     quantify,
     write_featurefile,
-    samtools_stats,
     aggregate_featurecount,
+    clean_featurecount,
     sum_to_taxa,
     quantify_features,
     sum_to_rgi
@@ -71,20 +71,6 @@ rule remove_mark_duplicates:
         rm {params.temp_bam} {params.rehead_bam} {params.header}
         """
 
-rule samtools_stats:
-    input:
-        opj(config["paths"]["results"], "assembly", "{assembly}",
-                "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam")
-    output:
-        opj(config["paths"]["results"], "assembly", "{assembly}",
-                "mapping", "{sample}_{unit}_{seq_type}"+POSTPROCESS+".bam.stats")
-    conda:
-        "../envs/quantify.yml"
-    shell:
-        """
-        samtools stats {input} > {output}
-        """
-
 ##### featurecounts #####
 
 rule featurecount:

From 10c35596c04a7a9473593a37b271a13b6b0a1f6f Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 8 Oct 2020 15:06:26 +0200
Subject: [PATCH 16/63] Perform check for unclassified column

---
 workflow/scripts/normalize.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/normalize.R b/workflow/scripts/normalize.R
index f81b636c..c8c763da 100644
--- a/workflow/scripts/normalize.R
+++ b/workflow/scripts/normalize.R
@@ -19,7 +19,9 @@ method <- snakemake@params$method
 # Read the counts
 x <- read.delim(snakemake@input[[1]], row.names = 1)
 # Remove unclassified
-x <- x[row.names(x)!="Unclassified", ]
+if ("Unclassified" %in% row.names(x)){
+    x <- x[row.names(x)!="Unclassified", ]
+}
 # Extract only numeric columns
 x_num <- num_cols(x)
 

From 667cba42f3b7f03b5ed1e09928c6bb55e17a4608 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 8 Oct 2020 15:07:01 +0200
Subject: [PATCH 17/63] Drop TPM functionality

---
 workflow/scripts/quantification_utils.py | 169 +++++++++--------------
 1 file changed, 68 insertions(+), 101 deletions(-)

diff --git a/workflow/scripts/quantification_utils.py b/workflow/scripts/quantification_utils.py
index 8629fef0..1e780aaf 100644
--- a/workflow/scripts/quantification_utils.py
+++ b/workflow/scripts/quantification_utils.py
@@ -36,117 +36,82 @@ def write_featurefile(sm, score=".", group="gene_id", phase="."):
     return
 
 
-def calculate_tpm(df, readlength, fhlog):
-    """
-    Calculates transcripts per million normalized values for a sample based
-    on featureCounts output
-    :param df: pandas DataFrame as read from featureCounts output
-    :param readlength: average length of mapped reads
-    :param fhlog: loghandle
-    :return: pandas DataFrame with normalized values per gene
-    """
-    sampleName = df.columns[-1]
-    # 1. Calculate t for sample
-    # t = (reads_mapped_to_gene * read_length) / length_of_gene
-    # Multiply gene counts with read length,
-    # then divide by the Length column (in kb)
-    fhlog.write("Normalizing by read length and gene length\n")
-    t = df[sampleName].multiply(readlength).div(df["Length"].div(1000))
-    df = df.assign(t=pd.Series(t, index=df.index))
-    # 2. Calculate T
-    # T = sum(t)
-    fhlog.write("Calculating sum of normalized values\n")
-    T = df["t"].sum()
-    # 3. Calculate TPM
-    # TPM = t*10^6 / T
-    fhlog.write("Normalization factor T is {}\n".format(T))
-    fhlog.write("Calculating TPM\n")
-    TPM = (df["t"].multiply(1000000)).div(T)
-    df = df.assign(TPM=pd.Series(TPM, index=df.index))
-    return df
-
-
-def get_readlength(f):
-    """
-    Reads samtools flagstat file and extract average mapped read length
-    :param f: samtools flagstat file
-    :return:
-    """
-    with open(f, 'r') as fhin:
-        for line in fhin:
-            line = line.rstrip()
-            if line.startswith("SN") and "average length:" in line:
-                length = line.rsplit()[-1]
-    return float(length)
-
-
-def normalize_featurecount(sm):
-    """
-    Master rule for running normalization
-    :param sm: snakemake object
-    :return:
-    """
-    df = pd.read_csv(sm.input[0], skiprows=1, sep="\t")
+def clean_featurecount(sm):
+    """
+    This cleans the featureCounts output table from format:
+    # Program:featureCounts v2.0.0; Command:"featureCounts" "-a" "etc."
+    Geneid  Chr         Start   End     Strand  Length  path/to/bam/sample.bam
+    1_1     k141_7581   1       459     -       459     1
+    2_1     k141_0      469     714     +       246     2
+
+    To format:
+    gene_id         Length  sample
+    k141_7581_1     459     1
+    k141_0_1        246     1
+    """
+    df = pd.read_csv(sm.input[0], comment="#", sep="\t")
+    # Extract gene number and combine with contig id
+    df["gene_num"] = [x[1] for x in df.Geneid.str.split("_")]
+    df.set_index(df.Chr.map(str) + "_" + df.gene_num, inplace=True)
+    df.drop("gene_num", axis=1, inplace=True)
+    df.index.name = 'gene_id'
+    # Set sample and unit name from wildcards
     sample_unit = "{sample}_{unit}".format(sample=sm.wildcards.sample,
                                            unit=sm.wildcards.unit)
     df.columns = list(df.columns)[0:-1] + [sample_unit]
-    # Get average mapped read length
-    readlength = get_readlength(sm.input[1])
-
-    # Perform normalization
-    with open(sm.log[0], 'w') as fhlog:
-        df = calculate_tpm(df, readlength, fhlog)
-
-    df_tpm = df.iloc[:, [0, -1]]
-    df_tpm.columns = ["gene_id", sample_unit]
-    df_tpm.to_csv(sm.output[0], sep="\t", index=False)
-
-    df_raw = df.iloc[:, [0, 6]]
-    df_raw.columns = ["gene_id", sample_unit]
-    df_raw.to_csv(sm.output[1], sep="\t", index=False)
-
-
-def merge_files(files, gff_df):
-    """
-    Merges abundance tables from several samples
-    :param files: list of files
-    :param gff_df: pandas DataFrame with orf ids in the format <contig>_<orfnum>
-    :return: merged pandas DataFrame
-    """
-    df = pd.DataFrame()
-    for f in files:
-        _df = pd.read_csv(f, index_col=0, sep="\t")
-        df = pd.concat([df, _df], axis=1)
-    df = pd.merge(df, gff_df, left_index=True, right_on="gene_id")
-    df.drop("gene_id", axis=1, inplace=True)
-    df.set_index("orf", inplace=True)
-    return df
+    # Extract length and counts
+    df = df.loc[:, ["Length", sample_unit]]
+    df.to_csv(sm.output[0], sep="\t")
 
 
 def aggregate_featurecount(sm):
     """
-    Aggregates normalized and raw tables per sample into one table per assembly
+    Aggregates cleaned featureCounts tables into one table per assembly
 
     :param sm: snakemake object
     :return:
     """
-    gff_df = pd.read_csv(sm.input.gff_file, header=None, usecols=[0, 8],
-                         names=["contig", "gene"], sep="\t")
-    gff_df = gff_df.assign(
-        gene_id=pd.Series([x.replace("gene_id ", "") for x in gff_df.gene],
-                          index=gff_df.index))
-    gff_df = gff_df.assign(
-        suffix=pd.Series([x.split(" ")[-1].split("_")[-1] for x in gff_df.gene],
-                         index=gff_df.index))
-    gff_df = gff_df.assign(
-        orf=pd.Series(gff_df.contig + "_" + gff_df.suffix, index=gff_df.index))
-    gff_df = gff_df[["orf", "gene_id"]]
-
-    raw_df = merge_files(sm.input.raw_files, gff_df)
-    tpm_df = merge_files(sm.input.tpm_files, gff_df)
-    raw_df.to_csv(sm.output.raw, sep="\t")
-    tpm_df.to_csv(sm.output.tpm, sep="\t")
-
+    df = pd.DataFrame()
+    lmap = {}
+    for f in sm.input:
+        _df = pd.read_csv(f, sep="\t", index_col=0)
+        lmap.update(_df.to_dict()["Length"])
+        _df.drop("Length", axis=1, inplace=True)
+        df = pd.merge(df, _df, right_index=True, left_index=True, how="outer")
+    counts = pd.merge(df, pd.DataFrame(lmap, index=["Length"]).T,
+                      left_index=True, right_index=True)
+    counts.to_csv(sm.output[0], sep="\t")
+
+
+def process_and_sum(q_df, annot_df):
+    # Merge annotations and abundance
+    # keep ORFs without annotation as "Unclassified"
+    annot_q_df = pd.merge(annot_df, q_df, left_index=True, right_index=True,
+                          how="right")
+    annot_q_df.fillna("Unclassified", inplace=True)
+    feature_cols = annot_df.columns
+    annot_q_sum = annot_q_df.groupby(list(feature_cols)).sum().reset_index()
+    annot_q_sum.set_index(feature_cols[0], inplace=True)
+    return annot_q_sum
+
+
+def sum_to_features(abundance, parsed):
+    parsed_df = pd.read_csv(parsed, index_col=0, sep="\t")
+    abundance_df = pd.read_csv(abundance, index_col=0, sep="\t")
+    abundance_df.drop("Length", axis=1, inplace=True, errors="ignore")
+    feature_sum = process_and_sum(abundance_df, parsed_df)
+    return feature_sum
+
+
+def count_features(sm):
+    """
+    Counts reads mapped to features such as KOs, PFAMs etc.
+
+    :param sm:
+    :return:
+    """
+    feature_sum = sum_to_features(sm.input.abundance, sm.input.parsed)
+    feature_sum.to_csv(sm.output[0], sep="\t")
 
 def sum_to_taxa(sm):
     """
@@ -184,10 +149,12 @@ def sum_to_rgi(sm):
     dfsum = df.groupby("AMR Gene Family").sum()
     dfsum.to_csv(sm.output[0], sep="\t", index=True, header=True)
 
+
 def main(sm):
     toolbox = {"write_featurefile": write_featurefile,
-               "normalize_featurecount": normalize_featurecount,
+               "clean_featurecount": clean_featurecount,
                "aggregate_featurecount": aggregate_featurecount,
+               "count_features": count_features,
                "sum_to_taxa": sum_to_taxa,
                "sum_to_rgi": sum_to_rgi}
 

From 3520bce43e9b2025b254fcaf810482c1ad636676 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 8 Oct 2020 15:08:16 +0200
Subject: [PATCH 18/63] Clean up quantification with standard normalizations

This commit removes support for TPM normalization and attempts to clean up how genes and features are quantified and normalized
---
 workflow/rules/quantification.smk | 44 +++++++++++++++----------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index 4afa0a67..c494b6f2 100644
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -3,8 +3,9 @@ localrules:
     write_featurefile,
     aggregate_featurecount,
     clean_featurecount,
+    count_features,
+    normalize_features,
     sum_to_taxa,
-    quantify_features,
     sum_to_rgi
 
 ##### quantify master rule #####
@@ -12,8 +13,8 @@ localrules:
 rule quantify:
     input:
         expand(opj(config["paths"]["results"], "annotation", "{assembly}",
-                   "fc.{fc_type}.tsv"),
-               assembly=assemblies.keys(), fc_type=["tpm", "raw"])
+                   "gene_{counts_type}.tsv"),
+               assembly=assemblies.keys(), counts_type=["counts", "rpkm"])
 
 
 rule write_featurefile:
@@ -115,11 +116,11 @@ rule clean_featurecount:
 rule aggregate_featurecount:
     """Aggregates all cleaned count files from featureCounts"""
     input:
-        get_all_files(samples, opj(config["paths"]["results"],
-                                             "assembly", "{assembly}", "mapping"),
-                                ".fc.clean.tsv")
+        get_all_files(samples=samples,
+                      dir=opj(config["paths"]["results"],"assembly", "{assembly}", "mapping"),
+                      suffix=".fc.clean.tsv")
     output:
-        opj(config["paths"]["results"], "annotation", "{assembly}", "counts.tsv")
+        opj(config["paths"]["results"], "annotation", "{assembly}", "gene_counts.tsv")
     script:
         "../scripts/quantification_utils.py"
 
@@ -128,9 +129,9 @@ rule rpkm:
     Calculate RPKM for genes in an assembly
     """
     input:
-        opj(config["paths"]["results"], "annotation", "{assembly}", "counts.tsv")
+        opj(config["paths"]["results"], "annotation", "{assembly}", "gene_counts.tsv")
     output:
-        opj(config["paths"]["results"], "annotation", "{assembly}", "rpkm.tsv")
+        opj(config["paths"]["results"], "annotation", "{assembly}", "gene_rpkm.tsv")
     log:
         opj(config["paths"]["results"], "annotation", "{assembly}", "rpkm.log")
     params:
@@ -140,21 +141,18 @@ rule rpkm:
     script:
         "../scripts/normalize.R"
 
-rule quantify_features:
+rule count_features:
     input:
-        abund=opj(config["paths"]["results"], "annotation", "{assembly}", "fc.{fc_type}.tsv"),
+        abund=opj(config["paths"]["results"], "annotation", "{assembly}", "gene_counts.tsv"),
         annot=opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.tsv")
     output:
-        opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.{fc_type}.tsv")
-    shell:
-        """
-        python workflow/scripts/eggnog-parser.py \
-            quantify {input.abund} {input.annot} {output[0]}
-        """
+        opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.counts.tsv")
+    script:
+        "../scripts/quantification_utils.py"
 
-rule normalize:
+rule normalize_features:
     input:
-        opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.raw.tsv")
+        opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.counts.tsv")
     output:
         opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.{norm_method}.tsv")
     log:
@@ -170,18 +168,18 @@ rule sum_to_taxa:
     input:
         tax=opj(config["paths"]["results"], "annotation", "{assembly}", "taxonomy",
             "orfs.{db}.taxonomy.tsv".format(db=config["taxonomy"]["database"])),
-        abund=opj(config["paths"]["results"], "annotation", "{assembly}", "fc.{fc_type}.tsv")
+        abund=opj(config["paths"]["results"], "annotation", "{assembly}", "gene_{counts_type}.tsv")
     output:
-        opj(config["paths"]["results"], "annotation", "{assembly}", "taxonomy", "tax.{fc_type}.tsv")
+        opj(config["paths"]["results"], "annotation", "{assembly}", "taxonomy", "tax.{counts_type}.tsv")
     script:
         "../scripts/quantification_utils.py"
 
 rule sum_to_rgi:
     input:
         annot=opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.out.txt"),
-        abund=opj(config["paths"]["results"], "annotation", "{assembly}", "fc.{fc_type}.tsv")
+        abund=opj(config["paths"]["results"], "annotation", "{assembly}", "gene_{counts_type}.tsv")
     output:
-        opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.{fc_type}.tsv")
+        opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.{counts_type}.tsv")
     script:
         "../scripts/quantification_utils.py"
 

From f52228aeb603fdfe35c5463de8fad7e24500c0bc Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 8 Oct 2020 15:08:37 +0200
Subject: [PATCH 19/63] Add wildcard constraints for counts_type

---
 workflow/rules/common.smk | 1 +
 1 file changed, 1 insertion(+)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index f8e18459..6c9f52cc 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -36,6 +36,7 @@ wildcard_constraints:
     binner="[a-z]+",
     group="\w+",
     l="\d+",
+    counts_type="(counts|rpkm)",
     norm_method="(TMM|CSS|REL)"
 
 from scripts.common import check_uppmax, check_annotation, check_assembly, check_classifiers

From cae1c920778285d60cbc95eced4cd38cd9fb4cd1 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 8 Oct 2020 15:13:52 +0200
Subject: [PATCH 20/63] Update quantified feature output

---
 workflow/scripts/common.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py
index a79b5709..d8848a0f 100644
--- a/workflow/scripts/common.py
+++ b/workflow/scripts/common.py
@@ -611,22 +611,25 @@ def annotation_input(config, assemblies):
         # Add EGGNOG annotation
         if config["annotation"]["eggnog"]:
             input += expand(opj(config["paths"]["results"], "annotation", group,
-                                "{db}.parsed.{fc}.tsv"),
+                                "{db}.parsed.{norm_method}.tsv"),
                             db=["enzymes", "pathways", "kos", "modules"],
-                            fc=["raw", "tpm"])
+                            norm_method=["counts", "TMM", "REL", "CSS"])
         # Add PFAM annotation
         if config["annotation"]["pfam"]:
             input += expand(opj(config["paths"]["results"], "annotation", group,
-                                "pfam.parsed.{fc}.tsv"), fc=["tpm", "raw"])
+                                "pfam.parsed.{norm_method}.tsv"),
+                            norm_method=["counts", "TMM", "REL", "CSS"])
         # Add taxonomic annotation
         if config["annotation"]["taxonomy"]:
             input += expand(
                 opj(config["paths"]["results"], "annotation", group, "taxonomy",
-                    "tax.{fc}.tsv"), fc=["tpm", "raw"])
+                    "tax.{counts_type}.tsv"),
+                counts_type=["counts", "rpkm"])
         # Add Resistance Gene Identifier output
         if config["annotation"]["rgi"]:
             input += expand(opj(config["paths"]["results"], "annotation", group,
-                                "rgi.{fc}.tsv"), fc=["raw", "tpm"])
+                                "rgi.{norm_method}.tsv"),
+                            norm_method=["counts", "TMM", "REL", "CSS"])
             input.append(opj(config["paths"]["results"], "annotation", group,
                              "rgi.out.txt"))
     return input

From 40b57fb04671395c6946bbe32a61d6d75887f6d6 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 11:23:25 +0200
Subject: [PATCH 21/63] Remove length column when quantifying taxa

---
 workflow/scripts/quantification_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/workflow/scripts/quantification_utils.py b/workflow/scripts/quantification_utils.py
index 1e780aaf..21bf9797 100644
--- a/workflow/scripts/quantification_utils.py
+++ b/workflow/scripts/quantification_utils.py
@@ -126,6 +126,8 @@ def sum_to_taxa(sm):
     df = pd.read_csv(sm.input.tax, sep="\t", index_col=0, header=None,
                      names=header)
     abund_df = pd.read_csv(sm.input.abund, header=0, index_col=0, sep="\t")
+    # Remove length column
+    abund_df.drop("Length", axis=1, inplace=True, errors="ignore")
     taxa_abund = pd.merge(df, abund_df, right_index=True, left_index=True)
     taxa_abund_sum = taxa_abund.groupby(header[1:]).sum().reset_index()
     taxa_abund_sum.to_csv(sm.output[0], sep="\t", index=False)

From 465ba135ba3c744f03ece2060ca25b058d40c9ee Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 11:23:54 +0200
Subject: [PATCH 22/63] Normalize counts after summing for RGI

---
 workflow/scripts/common.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py
index d8848a0f..0e9b06b2 100644
--- a/workflow/scripts/common.py
+++ b/workflow/scripts/common.py
@@ -628,10 +628,8 @@ def annotation_input(config, assemblies):
         # Add Resistance Gene Identifier output
         if config["annotation"]["rgi"]:
             input += expand(opj(config["paths"]["results"], "annotation", group,
-                                "rgi.{norm_method}.tsv"),
+                                "rgi.parsed.{norm_method}.tsv"),
                             norm_method=["counts", "TMM", "REL", "CSS"])
-            input.append(opj(config["paths"]["results"], "annotation", group,
-                             "rgi.out.txt"))
     return input
 
 

From 1ee63292796dcfd2004048f6e46eda01f59d7788 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 11:27:42 +0200
Subject: [PATCH 23/63] Update RGI version

---
 workflow/envs/rgi.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/rgi.yml b/workflow/envs/rgi.yml
index 85cf0673..b0657038 100644
--- a/workflow/envs/rgi.yml
+++ b/workflow/envs/rgi.yml
@@ -3,4 +3,4 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - rgi=4.0.3
+  - rgi=5.1.1

From cc2aea658d8a7ce6f419b124399b49a5c520b8f8 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 13:42:02 +0200
Subject: [PATCH 24/63] Update snakemake input for summing to features

---
 workflow/scripts/quantification_utils.py | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/workflow/scripts/quantification_utils.py b/workflow/scripts/quantification_utils.py
index 21bf9797..deddaf3b 100644
--- a/workflow/scripts/quantification_utils.py
+++ b/workflow/scripts/quantification_utils.py
@@ -110,7 +110,7 @@ def count_features(sm):
     :param sm:
     :return:
     """
-    feature_sum = sum_to_features(sm.input.abundance, sm.input.parsed)
+    feature_sum = sum_to_features(sm.input.abund, sm.input.annot)
     feature_sum.to_csv(sm.output[0], sep="\t")
 
 def sum_to_taxa(sm):
@@ -133,32 +133,12 @@ def sum_to_taxa(sm):
     taxa_abund_sum.to_csv(sm.output[0], sep="\t", index=False)
 
 
-def sum_to_rgi(sm):
-    """
-    Takes Resistance gene identifier output and abundance info per orf and sums
-    values to gene family level
-
-    :param sm: snakemake object
-    :return:
-    """
-    annot = pd.read_csv(sm.input.annot, sep="\t", header=0, index_col=0,
-                        usecols=[0, 16])
-    # Rename index for annotations to remove text after whitespace
-    annot.rename(index=lambda x: x.split(" ")[0], inplace=True)
-    abund = pd.read_csv(sm.input.abund, sep="\t", header=0, index_col=0)
-    df = pd.merge(annot, abund, left_index=True, right_index=True)
-    # Sum to Gene family
-    dfsum = df.groupby("AMR Gene Family").sum()
-    dfsum.to_csv(sm.output[0], sep="\t", index=True, header=True)
-
-
 def main(sm):
     toolbox = {"write_featurefile": write_featurefile,
                "clean_featurecount": clean_featurecount,
                "aggregate_featurecount": aggregate_featurecount,
                "count_features": count_features,
-               "sum_to_taxa": sum_to_taxa,
-               "sum_to_rgi": sum_to_rgi}
+               "sum_to_taxa": sum_to_taxa}
 
     toolbox[sm.rule](sm)
 

From 42ea0e67e4b9ab9e549c16ad9dc4e424a9a1e879 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 13:42:25 +0200
Subject: [PATCH 25/63] Update RGI rule to match version change

---
 workflow/rules/annotation.smk | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index fd60c200..883a9961 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -411,6 +411,7 @@ rule rgi:
     params:
         out=opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.out"),
         settings="-a diamond --local --clean --input_type protein"
+    shadow: "minimal"
     conda:
         "../envs/rgi.yml"
     threads: 10
@@ -418,7 +419,15 @@ rule rgi:
         runtime=lambda wildcards, attempt: attempt**2*60
     shell:
         """
-        rgi load --card_json {input.db} --local > {log} 2>&1
+        rgi load -i {input.db} --local > {log} 2>&1
         rgi main -i {input.faa} -o {params.out} \
             -n {threads} {params.settings} >>{log} 2>>{log}
         """
+
+rule parse_rgi:
+    input:
+        txt=opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.out.txt")
+    output:
+        tsv=opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.parsed.tsv")
+    script:
+        "../scripts/annotation_utils.py"

From f4fd582fff64f22902a0b542c437da80878968d5 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 13:45:06 +0200
Subject: [PATCH 26/63] Add function to parse RGI output similarly to PFAM
 output

---
 workflow/scripts/annotation_utils.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/annotation_utils.py b/workflow/scripts/annotation_utils.py
index 09bf745f..b878252c 100644
--- a/workflow/scripts/annotation_utils.py
+++ b/workflow/scripts/annotation_utils.py
@@ -3,6 +3,13 @@
 import pandas as pd
 
 
+def parse_rgi(sm):
+    annot = pd.read_csv(sm.input.txt, sep="\t", index_col=0)
+    annot = annot.loc[:, ["AMR Gene Family", "Resistance Mechanism"]]
+    annot.rename(index=lambda x: x.split(" ")[0], inplace=True)
+    annot.to_csv(sm.output.tsv, sep="\t", index=True)
+
+
 def parse_pfam(sm):
     annot = pd.read_csv(sm.input[0], comment="#", header=None, sep=" +",
                         usecols=[0, 5, 7, 14], engine="python",
@@ -28,7 +35,8 @@ def parse_pfam(sm):
 
 
 def main(sm):
-    toolbox = {"parse_pfam": parse_pfam}
+    toolbox = {"parse_pfam": parse_pfam,
+               "parse_rgi": parse_rgi}
     toolbox[sm.rule](sm)
 
 

From 070cda4e497b55688126afd4d9c397e4e85c10a1 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 13:51:21 +0200
Subject: [PATCH 27/63] Remove specific rule for summing RGI

---
 workflow/rules/quantification.smk | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index c494b6f2..b5dbd44f 100644
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -142,6 +142,9 @@ rule rpkm:
         "../scripts/normalize.R"
 
 rule count_features:
+    """
+    Sums read counts for gene annotation features such as pfam, KOs etc.
+    """
     input:
         abund=opj(config["paths"]["results"], "annotation", "{assembly}", "gene_counts.tsv"),
         annot=opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.tsv")
@@ -151,6 +154,9 @@ rule count_features:
         "../scripts/quantification_utils.py"
 
 rule normalize_features:
+    """
+    Normalizes counts of features using TMM, REL or CSS
+    """
     input:
         opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.counts.tsv")
     output:
@@ -165,6 +171,9 @@ rule normalize_features:
         "../scripts/normalize.R"
 
 rule sum_to_taxa:
+    """
+    Sums read counts and RPKM values for genes to assigned taxonomy
+    """
     input:
         tax=opj(config["paths"]["results"], "annotation", "{assembly}", "taxonomy",
             "orfs.{db}.taxonomy.tsv".format(db=config["taxonomy"]["database"])),
@@ -173,13 +182,3 @@ rule sum_to_taxa:
         opj(config["paths"]["results"], "annotation", "{assembly}", "taxonomy", "tax.{counts_type}.tsv")
     script:
         "../scripts/quantification_utils.py"
-
-rule sum_to_rgi:
-    input:
-        annot=opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.out.txt"),
-        abund=opj(config["paths"]["results"], "annotation", "{assembly}", "gene_{counts_type}.tsv")
-    output:
-        opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.{counts_type}.tsv")
-    script:
-        "../scripts/quantification_utils.py"
-

From 70f4f64fe04044572b6b3af5f9dffe6fbe046fac Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 21:36:27 +0200
Subject: [PATCH 28/63] Fix spelling of RLE method

---
 workflow/rules/common.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 6c9f52cc..5ec04ede 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -37,7 +37,7 @@ wildcard_constraints:
     group="\w+",
     l="\d+",
     counts_type="(counts|rpkm)",
-    norm_method="(TMM|CSS|REL)"
+    norm_method="(TMM|CSS|RLE)"
 
 from scripts.common import check_uppmax, check_annotation, check_assembly, check_classifiers
 

From 53e9ada8a2a7cb53642b362be0b4ab3f0149e80a Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 21:37:33 +0200
Subject: [PATCH 29/63] Fix spelling of RLE method

---
 workflow/scripts/common.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py
index 0e9b06b2..378c6350 100644
--- a/workflow/scripts/common.py
+++ b/workflow/scripts/common.py
@@ -613,12 +613,12 @@ def annotation_input(config, assemblies):
             input += expand(opj(config["paths"]["results"], "annotation", group,
                                 "{db}.parsed.{norm_method}.tsv"),
                             db=["enzymes", "pathways", "kos", "modules"],
-                            norm_method=["counts", "TMM", "REL", "CSS"])
+                            norm_method=["counts", "TMM", "RLE", "CSS"])
         # Add PFAM annotation
         if config["annotation"]["pfam"]:
             input += expand(opj(config["paths"]["results"], "annotation", group,
                                 "pfam.parsed.{norm_method}.tsv"),
-                            norm_method=["counts", "TMM", "REL", "CSS"])
+                            norm_method=["counts", "TMM", "RLE", "CSS"])
         # Add taxonomic annotation
         if config["annotation"]["taxonomy"]:
             input += expand(
@@ -629,7 +629,7 @@ def annotation_input(config, assemblies):
         if config["annotation"]["rgi"]:
             input += expand(opj(config["paths"]["results"], "annotation", group,
                                 "rgi.parsed.{norm_method}.tsv"),
-                            norm_method=["counts", "TMM", "REL", "CSS"])
+                            norm_method=["counts", "TMM", "RLE", "CSS"])
     return input
 
 

From dc7b801b92709c5e0ad98d67365946e00d3b5bbb Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 21:37:53 +0200
Subject: [PATCH 30/63] Include unique CARD model id

---
 workflow/scripts/annotation_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/annotation_utils.py b/workflow/scripts/annotation_utils.py
index b878252c..657957bf 100644
--- a/workflow/scripts/annotation_utils.py
+++ b/workflow/scripts/annotation_utils.py
@@ -5,7 +5,8 @@
 
 def parse_rgi(sm):
     annot = pd.read_csv(sm.input.txt, sep="\t", index_col=0)
-    annot = annot.loc[:, ["AMR Gene Family", "Resistance Mechanism"]]
+    annot = annot.loc[:, ["Model_ID", "AMR Gene Family", "Resistance Mechanism"]]
+    annot.loc[:, "Model_ID"] = ["RGI_{}".format(x) for x in annot.Model_ID]
     annot.rename(index=lambda x: x.split(" ")[0], inplace=True)
     annot.to_csv(sm.output.tsv, sep="\t", index=True)
 

From 5b099b10ef61f1daa49c83882845ad3e3cdfa5f2 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 21:52:39 +0200
Subject: [PATCH 31/63] Add minimum mapping qual for featureCounts

---
 workflow/rules/quantification.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index b5dbd44f..7cd9441f 100644
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -91,7 +91,7 @@ rule featurecount:
     threads: 4
     params:
         tmpdir=config["paths"]["temp"],
-        setting=lambda wildcards: "-B -p" if wildcards.seq_type == "pe" else ""
+        setting=lambda wildcards: "-Q 10 -B -p" if wildcards.seq_type == "pe" else ""
     resources:
         runtime=lambda wildcards, attempt: attempt**2*30
     conda:

From 69d102addaf2fb944a16a1345a1bd605adfdf8b2 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 22:30:18 +0200
Subject: [PATCH 32/63] Handle cases where samples have only 1 feature

---
 workflow/scripts/normalize.R | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/normalize.R b/workflow/scripts/normalize.R
index c8c763da..66bdb489 100644
--- a/workflow/scripts/normalize.R
+++ b/workflow/scripts/normalize.R
@@ -17,11 +17,16 @@ str_cols <- function(x) {
 method <- snakemake@params$method
 
 # Read the counts
-x <- read.delim(snakemake@input[[1]], row.names = 1)
+x <- read.delim(snakemake@input[[1]], row.names = 1, sep = "\t", header = TRUE)
 # Remove unclassified
 if ("Unclassified" %in% row.names(x)){
     x <- x[row.names(x)!="Unclassified", ]
 }
+if (nrow(x)==0) {
+    write.table(x, file = snakemake@output[[1]], quote = FALSE, sep="\t")
+    quit()
+}
+
 # Extract only numeric columns
 x_num <- num_cols(x)
 
@@ -46,6 +51,16 @@ if (method %in% c("TMM", "RLE")) {
 } else {
     library(metagenomeSeq)
     obj <- newMRexperiment(x_num)
+    smat = lapply(1:ncol(x_num), function(i) {
+	    sort(x_num[which(x_num[, i]>0),i], decreasing = TRUE)
+	})
+    if(any(sapply(smat,length)==1)) {
+        fh <-file(snakemake@output[[1]])
+        writeLines(c("WARNING: Sample with one or zero features",
+                     "Cumulative Sum Scaling failed for sample"), fh)
+        close(fh)
+        quit()
+    }
     norm <- MRcounts(obj, norm = TRUE)
 }
 

From 05c949b2e26dcf282bdcf11770d648e3c9a5d07c Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 22:31:11 +0200
Subject: [PATCH 33/63] Add testing for RGI

---
 .github/workflows/main.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 14b2bdde..c97a0034 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -49,10 +49,13 @@ jobs:
     - name: Kraken
       run: |
         snakemake --use-conda -j 4 --configfile .test/config/kraken.yaml -p results/kraken/sample1_1_pe.kreport results/kraken/sample4_1_se.kreport
-    - name: Metaspades
+    - name: Metaspades + RGI
       run: |
         snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p assemble
         rm -r results/assembly results/report examples/data/sample*
+    - name: RGI
+      run: |
+        snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p annotate
     - name: Prepare taxonomy
       run: bash .test/scripts/prep_taxonomy.sh
     - name: Taxonomy

From 362762bfcafd29880727e251f9ee218a2fc75231 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Mon, 12 Oct 2020 22:31:37 +0200
Subject: [PATCH 34/63] Update metaspades test config

---
 .test/config/metaspades.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.test/config/metaspades.yaml b/.test/config/metaspades.yaml
index 96b5ad14..e619bec8 100644
--- a/.test/config/metaspades.yaml
+++ b/.test/config/metaspades.yaml
@@ -117,13 +117,13 @@ annotation:
   # run tRNAscan-SE?
   tRNAscan: False
   # run infernal for rRNA identification?
-  infernal: True
+  infernal: False
   # run eggnog-mapper to infer KEGG orthologs, pathways and modules?
   eggnog: False
   # run PFAM-scan to infer protein families from PFAM?
-  pfam: True
+  pfam: False
   # run Resistance gene identifier?
-  rgi: False
+  rgi: True
   # run taxonomic annotation of assembled contigs (using tango + sourmash)?
   taxonomy: False
 

From 819643a392dc9d8f5a71cef6b96852490a60c683 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 08:11:08 +0200
Subject: [PATCH 35/63] Update localrules

---
 workflow/rules/quantification.smk | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index 7cd9441f..95bc40f8 100644
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -1,12 +1,12 @@
 localrules:
     quantify,
     write_featurefile,
-    aggregate_featurecount,
     clean_featurecount,
+    aggregate_featurecount,
+    rpkm,
     count_features,
     normalize_features,
-    sum_to_taxa,
-    sum_to_rgi
+    sum_to_taxa
 
 ##### quantify master rule #####
 

From 0a02f260a1e77662f9810d773255575021910efe Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 08:19:18 +0200
Subject: [PATCH 36/63] Update localrules

---
 workflow/rules/annotation.smk | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index 883a9961..1a3c9e3e 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -2,16 +2,17 @@ from scripts.common import annotation_input
 
 localrules: 
     annotate,
-    parse_ko_annotations, 
-    parse_pfam,
     download_rfams,
     press_rfams,
     download_pfam,
-    press_pfam,
     download_pfam_info,
+    press_pfam,
+    parse_pfam,
     download_eggnog,
     get_kegg_info,
-    download_rgi_data
+    parse_ko_annotations,
+    download_rgi_data,
+    parse_rgi
 
 ##### annotation master rule #####
 

From db2405f3b29d2d9de33434567247ea4db439f20f Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 08:22:38 +0200
Subject: [PATCH 37/63] Update localrules

---
 workflow/rules/assembly.smk | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk
index 6246192f..35890f2a 100644
--- a/workflow/rules/assembly.smk
+++ b/workflow/rules/assembly.smk
@@ -16,6 +16,8 @@ rule assemble:
         opj(config["paths"]["results"], "report", "assembly", "alignment_frequency.pdf")
 
 if config["assembly"]["metaspades"]:
+    localrules:
+        generate_metaspades_input
     rule generate_metaspades_input:
         """Generate input files for use with Metaspades"""
         input:
@@ -97,6 +99,8 @@ if config["assembly"]["metaspades"]:
             """
 
 else:
+    localrules:
+        generate_megahit_input
     rule generate_megahit_input:
         """Generate input lists for Megahit"""
         input:

From 66758a64ca3a56de63dea1e948113a24ae8072c5 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 08:33:56 +0200
Subject: [PATCH 38/63] Update localrules

---
 workflow/rules/binning.smk | 1 -
 1 file changed, 1 deletion(-)

diff --git a/workflow/rules/binning.smk b/workflow/rules/binning.smk
index 232f1be1..c341f678 100644
--- a/workflow/rules/binning.smk
+++ b/workflow/rules/binning.smk
@@ -22,7 +22,6 @@ localrules:
     cluster_genomes,
     count_rRNA,
     count_tRNA,
-    aggregate_gtdbtk,
     aggregate_bin_annot,
     binning_report
 

From 960b2c85182808da205bbd561a6a888d54a04ef4 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 08:42:55 +0200
Subject: [PATCH 39/63] Change channel order

---
 workflow/envs/normalize.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/normalize.yml b/workflow/envs/normalize.yml
index 273cdfa9..6b0d5133 100644
--- a/workflow/envs/normalize.yml
+++ b/workflow/envs/normalize.yml
@@ -1,7 +1,7 @@
 name: normalize
 channels:
-  - r
   - bioconda
+  - r
   - defaults
 dependencies:
   - bioconductor-edger=3.30.0

From 11d01e395d6bbc74f0e38ccb7484d5aac3f9a554 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 09:23:01 +0200
Subject: [PATCH 40/63] Clear versions for normalize.yml

---
 workflow/envs/normalize.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/workflow/envs/normalize.yml b/workflow/envs/normalize.yml
index 6b0d5133..f60900c0 100644
--- a/workflow/envs/normalize.yml
+++ b/workflow/envs/normalize.yml
@@ -1,8 +1,7 @@
 name: normalize
 channels:
   - bioconda
-  - r
   - defaults
 dependencies:
-  - bioconductor-edger=3.30.0
-  - bioconductor-metagenomeseq=1.30.0
+  - bioconductor-edger
+  - bioconductor-metagenomeseq

From 98e219649bacaa0c666aaf2bc926b3247b6f45e9 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 12:17:56 +0200
Subject: [PATCH 41/63] Formatting fixes after running pre-commit

---
 workflow/envs/annotation.yml   | 2 +-
 workflow/envs/barrnap.yml      | 2 +-
 workflow/envs/centrifuge.yml   | 2 +-
 workflow/envs/checkm.yml       | 2 +-
 workflow/envs/concoct.yml      | 2 +-
 workflow/envs/cookiecutter.yml | 2 +-
 workflow/envs/graphlan.yml     | 2 +-
 workflow/envs/kraken.yml       | 2 +-
 workflow/envs/krona.yml        | 2 +-
 workflow/envs/maxbin.yml       | 2 +-
 workflow/envs/megahit.yml      | 2 +-
 workflow/envs/metabat.yml      | 2 +-
 workflow/envs/metaphlan.yml    | 2 +-
 workflow/envs/metaspades.yml   | 2 +-
 workflow/envs/plotting.yml     | 2 +-
 workflow/envs/preprocess.yml   | 2 +-
 workflow/envs/quantify.yml     | 2 +-
 17 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/workflow/envs/annotation.yml b/workflow/envs/annotation.yml
index f3daf0a7..924d0138 100644
--- a/workflow/envs/annotation.yml
+++ b/workflow/envs/annotation.yml
@@ -8,4 +8,4 @@ dependencies:
   - pfam_scan=1.6
   - eggnog-mapper=2.0.1
   - infernal=1.1.2
-  - trnascan-se=2.0.5
\ No newline at end of file
+  - trnascan-se=2.0.5
diff --git a/workflow/envs/barrnap.yml b/workflow/envs/barrnap.yml
index 53ae4d61..ff933f29 100644
--- a/workflow/envs/barrnap.yml
+++ b/workflow/envs/barrnap.yml
@@ -4,4 +4,4 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - barrnap=0.9
\ No newline at end of file
+  - barrnap=0.9
diff --git a/workflow/envs/centrifuge.yml b/workflow/envs/centrifuge.yml
index d4579e74..af69b00d 100644
--- a/workflow/envs/centrifuge.yml
+++ b/workflow/envs/centrifuge.yml
@@ -2,4 +2,4 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - centrifuge=1.0.4_beta
\ No newline at end of file
+  - centrifuge=1.0.4_beta
diff --git a/workflow/envs/checkm.yml b/workflow/envs/checkm.yml
index 35cc6929..d0488e59 100644
--- a/workflow/envs/checkm.yml
+++ b/workflow/envs/checkm.yml
@@ -5,4 +5,4 @@ channels:
   - defaults
 dependencies:
   - python=3.7.6
-  - checkm-genome=1.1.2
\ No newline at end of file
+  - checkm-genome=1.1.2
diff --git a/workflow/envs/concoct.yml b/workflow/envs/concoct.yml
index f5f23857..b3a12405 100644
--- a/workflow/envs/concoct.yml
+++ b/workflow/envs/concoct.yml
@@ -5,4 +5,4 @@ channels:
 dependencies:
   - python=3.7.6
   - biopython=1.76
-  - concoct=1.1.0
\ No newline at end of file
+  - concoct=1.1.0
diff --git a/workflow/envs/cookiecutter.yml b/workflow/envs/cookiecutter.yml
index 35633e58..5ea2b60a 100644
--- a/workflow/envs/cookiecutter.yml
+++ b/workflow/envs/cookiecutter.yml
@@ -2,4 +2,4 @@ name: cookiecutter
 channels:
   - conda-forge
 dependencies:
-  - cookiecutter
\ No newline at end of file
+  - cookiecutter
diff --git a/workflow/envs/graphlan.yml b/workflow/envs/graphlan.yml
index 68c09ae9..6bb097c6 100644
--- a/workflow/envs/graphlan.yml
+++ b/workflow/envs/graphlan.yml
@@ -2,4 +2,4 @@ channels:
   - bioconda
 dependencies:
   - graphlan=1.1.3
-  - export2graphlan=0.20
\ No newline at end of file
+  - export2graphlan=0.20
diff --git a/workflow/envs/kraken.yml b/workflow/envs/kraken.yml
index f582d7e7..a8e77cb4 100644
--- a/workflow/envs/kraken.yml
+++ b/workflow/envs/kraken.yml
@@ -3,4 +3,4 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - kraken2=2.0.8_beta
\ No newline at end of file
+  - kraken2=2.0.8_beta
diff --git a/workflow/envs/krona.yml b/workflow/envs/krona.yml
index eb0bfce6..1596854c 100644
--- a/workflow/envs/krona.yml
+++ b/workflow/envs/krona.yml
@@ -3,4 +3,4 @@ channels:
   - defaults
 dependencies:
   - krona=2.7.1
-  - make=4.2.1
\ No newline at end of file
+  - make=4.2.1
diff --git a/workflow/envs/maxbin.yml b/workflow/envs/maxbin.yml
index 46fac4a1..5c9396c8 100644
--- a/workflow/envs/maxbin.yml
+++ b/workflow/envs/maxbin.yml
@@ -2,4 +2,4 @@ channels:
    - bioconda
 dependencies:
   - maxbin2=2.2.7
-  - perl-lwp-simple=6.15
\ No newline at end of file
+  - perl-lwp-simple=6.15
diff --git a/workflow/envs/megahit.yml b/workflow/envs/megahit.yml
index abe57440..2144d4a0 100644
--- a/workflow/envs/megahit.yml
+++ b/workflow/envs/megahit.yml
@@ -4,4 +4,4 @@ channels:
   - defaults
 dependencies:
   - python=3.8.1
-  - megahit=1.2.9
\ No newline at end of file
+  - megahit=1.2.9
diff --git a/workflow/envs/metabat.yml b/workflow/envs/metabat.yml
index 4b44d966..094da655 100644
--- a/workflow/envs/metabat.yml
+++ b/workflow/envs/metabat.yml
@@ -4,4 +4,4 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - metabat2=2.14
\ No newline at end of file
+  - metabat2=2.14
diff --git a/workflow/envs/metaphlan.yml b/workflow/envs/metaphlan.yml
index ab3e45a7..5b7adde4 100644
--- a/workflow/envs/metaphlan.yml
+++ b/workflow/envs/metaphlan.yml
@@ -5,4 +5,4 @@ channels:
   - defaults
 dependencies:
   - python=3.7.6
-  - metaphlan=3.0
\ No newline at end of file
+  - metaphlan=3.0
diff --git a/workflow/envs/metaspades.yml b/workflow/envs/metaspades.yml
index 36c49084..7c45e1c6 100644
--- a/workflow/envs/metaspades.yml
+++ b/workflow/envs/metaspades.yml
@@ -4,4 +4,4 @@ channels:
   - defaults
 dependencies:
   - python=3.8.1
-  - spades=3.14.0
\ No newline at end of file
+  - spades=3.14.0
diff --git a/workflow/envs/plotting.yml b/workflow/envs/plotting.yml
index 1ed9df4a..1d619ec2 100644
--- a/workflow/envs/plotting.yml
+++ b/workflow/envs/plotting.yml
@@ -6,4 +6,4 @@ dependencies:
   - python=3.8.2
   - seaborn=0.10.1
   - pandas=1.0.3
-  - jupyter=1.0.0
\ No newline at end of file
+  - jupyter=1.0.0
diff --git a/workflow/envs/preprocess.yml b/workflow/envs/preprocess.yml
index 0a876c94..d3cd44ff 100644
--- a/workflow/envs/preprocess.yml
+++ b/workflow/envs/preprocess.yml
@@ -10,4 +10,4 @@ dependencies:
   - fastqc=0.11.9
   - multiqc=1.8
   - fastuniq=1.1
-  - bowtie2=2.3.5.1
\ No newline at end of file
+  - bowtie2=2.3.5.1
diff --git a/workflow/envs/quantify.yml b/workflow/envs/quantify.yml
index 73d39cf7..0d4626f4 100644
--- a/workflow/envs/quantify.yml
+++ b/workflow/envs/quantify.yml
@@ -7,4 +7,4 @@ dependencies:
   - subread=2.0.0
   - samtools=1.9
   - picard=2.21.9
-  - bowtie2=2.4.1
\ No newline at end of file
+  - bowtie2=2.4.1

From 63a564f7405958e71d9036e9a2e7837188b0971a Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 12:18:38 +0200
Subject: [PATCH 42/63] Formatting fixes after running pre-commit

---
 workflow/rules/assembly.smk       | 16 ++++++++--------
 workflow/rules/binning.smk        | 26 +++++++++++++-------------
 workflow/rules/classification.smk | 12 ++++++------
 workflow/rules/preprocessing.smk  |  8 ++++----
 workflow/rules/taxonomy.smk       |  4 ++--
 5 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/workflow/rules/assembly.smk b/workflow/rules/assembly.smk
index 35890f2a..416bfacc 100644
--- a/workflow/rules/assembly.smk
+++ b/workflow/rules/assembly.smk
@@ -77,7 +77,7 @@ if config["assembly"]["metaspades"]:
             metaspades.py \
                 -t {threads} -1 {input.R1} -2 {input.R2} $single \
                 -o {params.tmp} > {log} 2>&1
-            
+
             # If set to keep intermediate contigs, move to intermediate folder before deleting
             if [ "{config[metaspades][keep_intermediate]}" == "True" ]; then
                 mkdir -p {params.intermediate_contigs}
@@ -87,7 +87,7 @@ if config["assembly"]["metaspades"]:
                 mkdir -p {params.corrected}
                 cp -r {params.tmp}/corrected {params.corrected}
             fi
-            
+
             # Clear intermediate contigs
             rm -rf {params.tmp}/K*
             # Clear corrected reads dir
@@ -162,20 +162,20 @@ else:
             else
                 single=""
             fi
-            
+
             # Run Megahit
             megahit -t {threads} $paired $single -o {params.tmp} \
                 {params.additional_settings} >{log} 2>&1
-            
+
             # Sync intermediate contigs if asked for
             if [ "{config[megahit][keep_intermediate]}" == "True" ]; then
                 mkdir -p {params.intermediate_contigs}
                 cp -r {params.tmp}/intermediate_contigs/* {params.intermediate_contigs}
             fi
-            
+
             # Cleanup intermediate
             rm -rf {params.tmp}/intermediate_contigs
-            
+
             # Sync tmp output to outdir before removing
             cp -r {params.tmp}/* {params.output_dir}
             rm -rf {params.tmp}
@@ -277,7 +277,7 @@ rule bowtie_map_se:
     shell:
         """
         bowtie2 {params.setting} -p {threads} -x {params.prefix} \
-            -U {input.se} 2>{output.log} | samtools view -bh - | samtools sort - -o {params.temp_bam} 
+            -U {input.se} 2>{output.log} | samtools view -bh - | samtools sort - -o {params.temp_bam}
         samtools index {params.temp_bam}
         mv {params.temp_bam} {output.bam}
         mv {params.temp_bam}.bai {output.bai}
@@ -304,7 +304,7 @@ rule samtools_flagstat:
         "../envs/quantify.yml"
     shell:
         """
-        for f in {input} ; 
+        for f in {input} ;
         do
             al=$(samtools \
                 flagstat \
diff --git a/workflow/rules/binning.smk b/workflow/rules/binning.smk
index c341f678..ddb2f339 100644
--- a/workflow/rules/binning.smk
+++ b/workflow/rules/binning.smk
@@ -9,7 +9,7 @@ localrules:
     contig_map,
     binning_stats,
     aggregate_binning_stats,
-    download_checkm, 
+    download_checkm,
     checkm_qa,
     remove_checkm_zerocols,
     aggregate_checkm_stats,
@@ -118,12 +118,12 @@ rule maxbin:
         else
             # Rename fasta files
             ls {params.tmp_dir} | grep ".fasta" | while read f;
-            do 
+            do
                 mv {params.tmp_dir}/$f {params.dir}/${{f%.fasta}}.fa
             done
             # Move output from temporary dir
             ls {params.tmp_dir} | while read f;
-            do 
+            do
                 mv {params.tmp_dir}/$f {params.dir}/
             done
         fi
@@ -154,11 +154,11 @@ rule concoct_coverage_table:
         p=POSTPROCESS
     shell:
         """
-        for f in {input.bam} ; 
-            do 
-                n=$(basename $f); 
-                s=$(echo -e $n | sed 's/_[ps]e{params.p}.bam//g'); 
-                echo $s; 
+        for f in {input.bam} ;
+            do
+                n=$(basename $f);
+                s=$(echo -e $n | sed 's/_[ps]e{params.p}.bam//g');
+                echo $s;
             done > {params.samplenames}
         concoct_coverage_table.py \
             --samplenames {params.samplenames} \
@@ -422,7 +422,7 @@ rule checkm_qa:
         else
             checkm qa -o 2 --tab_table -f {output.tsv} \
                 {input.ms} {params.dir} > {log} 2>&1
-        fi 
+        fi
         """
 
 rule checkm_coverage:
@@ -572,7 +572,7 @@ rule gtdbtk_classify:
 
 rule aggregate_gtdbtk:
     """
-    Aggregates GTDB-TK phylogenetic results from several assemblies into a 
+    Aggregates GTDB-TK phylogenetic results from several assemblies into a
     single table.
     """
     input:
@@ -634,7 +634,7 @@ rule barrnap:
                     barrnap --kingdom $k --quiet {params.indir}/$g.fa | \
                         egrep -v "^#" | sed "s/$/;genome=$g/g" >> {output}
                 done
-        fi              
+        fi
         """
 
 rule count_rRNA:
@@ -687,7 +687,7 @@ rule trnascan_bins:
                     tRNAscan-SE $model --quiet --thread {threads} \
                         {params.indir}/$g.fa | tail -n +4 | sed "s/$/\t$g/g" >> {output}
                 done
-        fi          
+        fi
         """
 
 rule count_tRNA:
@@ -704,7 +704,7 @@ rule count_tRNA:
 
 rule aggregate_bin_annot:
     input:
-        trna=expand(opj(config["paths"]["results"], "binning", "{binner}", 
+        trna=expand(opj(config["paths"]["results"], "binning", "{binner}",
                         "{assembly}", "{l}", "tRNAscan", "tRNA.total.tsv"),
                     binner=get_binners(config),
                     assembly=assemblies.keys(),
diff --git a/workflow/rules/classification.smk b/workflow/rules/classification.smk
index 453ab6fc..db728d6c 100644
--- a/workflow/rules/classification.smk
+++ b/workflow/rules/classification.smk
@@ -141,7 +141,7 @@ rule download_centrifuge_build:
         tar -C {params.dir} -xf {params.tar} >>{log} 2>&1
         rm {params.tar}
         """
-    
+
 rule centrifuge_pe:
     input:
         R1=opj(config["paths"]["results"], "intermediate", "preprocess",
@@ -212,13 +212,13 @@ rule centrifuge_se:
 
 rule centrifuge_kreport:
     input:
-        f=opj(config["paths"]["results"], "centrifuge", 
+        f=opj(config["paths"]["results"], "centrifuge",
               "{sample}_{unit}_{seq_type}.out"),
         db=expand(opj(config["centrifuge"]["dir"],
-                      "{base}.{i}.cf"), 
+                      "{base}.{i}.cf"),
                   i=[1, 2, 3], base=config["centrifuge"]["base"])
     output:
-        opj(config["paths"]["results"], "centrifuge", 
+        opj(config["paths"]["results"], "centrifuge",
             "{sample}_{unit}_{seq_type}.kreport")
     params:
         min_score=config["centrifuge"]["min_score"],
@@ -306,7 +306,7 @@ rule metaphlan_se:
     resources:
         runtime=lambda wildcards, attempt: attempt**2*60*4
     shell:
-        """       
+        """
         metaphlan {input.se} --bowtie2db {params.dir} --add_viruses --force \
             --nproc {threads} --input_type fastq -o {output.tsv} \
              --bowtie2out {output.bt2} > {log} 2>&1
@@ -365,7 +365,7 @@ rule plot_metaphlan:
         "../envs/plotting.yml"
     notebook:
         "../notebooks/metaphlan.py.ipynb"
-        
+
 ##### krona #####
 
 rule krona_taxonomy:
diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk
index 55e532cb..32ab1110 100644
--- a/workflow/rules/preprocessing.smk
+++ b/workflow/rules/preprocessing.smk
@@ -131,13 +131,13 @@ rule sortmerna_fastq_pe:
         "../envs/preprocess.yml"
     shell:
          """
-         mkdir -p {params.scratch} 
+         mkdir -p {params.scratch}
          # Run SortMeRNA
          sortmerna --blast 1 --log -v --fastx --ref {params.ref_string} \
             --reads {input.fastq} -a {threads} --{params.paired_strategy} \
             --aligned {params.aligned_prefix} --other {params.other_prefix} \
             {params.score_params} >{log} 2>&1
-         
+
          mv {params.aligned_prefix}.fastq {output.aligned}
          mv {params.aligned_prefix}.log {log}
          mv {params.other_prefix}.fastq {output.other}
@@ -257,7 +257,7 @@ rule sortmerna_fastq_se:
             --reads {input.fastq} -a {threads} --other {params.other_prefix} \
             --aligned {params.aligned_prefix} {params.score_params} \
             >{log} 2>&1
-        
+
         mv {params.aligned_prefix}.fastq {output.aligned}
         mv {params.aligned_prefix}.log {log}
         mv {params.other_prefix}.fastq {output.other}
@@ -360,7 +360,7 @@ rule trimmomatic_pe:
             2>{log.R1log}
         sed \
             's/{wildcards.sample}_{wildcards.unit}_R1/{wildcards.sample}_{wildcards.unit}_R2/g' \
-            {log.R1log} > {log.R2log}    
+            {log.R1log} > {log.R2log}
         """
 
 rule trimmomatic_se:
diff --git a/workflow/rules/taxonomy.smk b/workflow/rules/taxonomy.smk
index a5af315f..1db9ae81 100644
--- a/workflow/rules/taxonomy.smk
+++ b/workflow/rules/taxonomy.smk
@@ -125,7 +125,7 @@ rule contigtax_update:
         else
             cd {params.dir}
             ln -s $(basename {input.idmap}) $(basename {output.idmap})
-        fi            
+        fi
         """
 
 rule contigtax_build:
@@ -145,7 +145,7 @@ rule contigtax_build:
     shell:
          """
          contigtax build -d {output} -p {threads} {input.fasta} \
-            {input.idmap} {input.nodes} >{log} 2>&1 
+            {input.idmap} {input.nodes} >{log} 2>&1
          """
 
 rule contigtax_search:

From 1895a046687d4e44735f03d7ac839ee300941034 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 12:19:56 +0200
Subject: [PATCH 43/63] Formatting fixes after running pre-commit

---
 workflow/rules/annotation.smk | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index 1a3c9e3e..67c5bd32 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -1,6 +1,6 @@
 from scripts.common import annotation_input
 
-localrules: 
+localrules:
     annotate,
     download_rfams,
     press_rfams,
@@ -59,7 +59,7 @@ rule trnascan:
     shell:
         """
         tRNAscan-SE -G -b {output.bed} -o {output.file} -a {output.fasta} \
-            --thread {threads} {input} >{log} 2>&1            
+            --thread {threads} {input} >{log} 2>&1
         """
 
 rule download_rfams:
@@ -85,11 +85,11 @@ rule download_rfams:
         # Extract only rfams of interest
         tar -C {params.dir} -zxf {output.tar} {params.rfams}
         cat {params.dir}/*.cm > {output.cm}
-        
+
         # Get release
         curl -o {output.readme} {params.url}/README 2>/dev/null
         grep -m 1 Release {output.readme} > {output.version}
-        
+
         # Get clans
         curl {params.url}/Rfam.clanin 2>/dev/null | egrep -w \
             "CL0011[123]" > {output.clanin}
@@ -287,7 +287,7 @@ rule emapper_homology_search:
     shell:
         """
         mkdir -p {params.tmpdir}
-        emapper.py {params.flags} --cpu {threads} --temp_dir {params.tmpdir} \ 
+        emapper.py {params.flags} --cpu {threads} --temp_dir {params.tmpdir} \
         -i {input[0]} -o {params.out} --output_dir {params.tmpdir} \
             --data_dir {params.resource_dir} >{log} 2>&1
         mv {params.tmp_out}.emapper.seed_orthologs {output[0]}

From e10586a3a88932633c233abe4e1ba2e14921ed35 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 12:22:20 +0200
Subject: [PATCH 44/63] Add pre-commit

---
 .editorconfig  | 1 -
 .gitattributes | 1 -
 2 files changed, 2 deletions(-)

diff --git a/.editorconfig b/.editorconfig
index baf8ae55..5573fb60 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -13,4 +13,3 @@ indent_size = 4
 [*.{yml,yaml}]
 indent_style = space
 indent_size = 2
-
diff --git a/.gitattributes b/.gitattributes
index f3011696..74d81cc5 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +1,2 @@
 *.smk linguist-language=Python
 Snakefile linguist-language=Python
-

From dc33c7459982f7c244b0a6dc2622025b68a64da9 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 12:25:57 +0200
Subject: [PATCH 45/63] Add pre-commit badge

---
 README.md                                     |  4 +-
 workflow/envs/edger.yml                       |  9 +++
 .../envs/{normalize.yml => metagenomeseq.yml} |  6 +-
 workflow/scripts/common.R                     | 27 ++++++++
 workflow/scripts/edger.R                      | 36 ++++++++++
 workflow/scripts/metagenomeseq.R              | 35 ++++++++++
 workflow/scripts/normalize.R                  | 69 -------------------
 7 files changed, 113 insertions(+), 73 deletions(-)
 create mode 100644 workflow/envs/edger.yml
 rename workflow/envs/{normalize.yml => metagenomeseq.yml} (60%)
 create mode 100644 workflow/scripts/common.R
 create mode 100644 workflow/scripts/edger.R
 create mode 100644 workflow/scripts/metagenomeseq.R
 delete mode 100644 workflow/scripts/normalize.R

diff --git a/README.md b/README.md
index 4ffc68f8..7a13be70 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ A workflow for metagenomic projects
 [![Snakemake 5.11.2](https://img.shields.io/badge/snakemake-5.11.2-brightgreen.svg)](https://img.shields.io/badge/snakemake-5.11.2)
 ![CI](https://github.com/NBISweden/nbis-meta/workflows/CI/badge.svg?branch=master)
 ![Docker](https://img.shields.io/docker/pulls/nbisweden/nbis-meta)
-
+[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
 
 ## Overview
 A [snakemake](http://snakemake.readthedocs.io/en/stable/) workflow for
@@ -19,7 +19,7 @@ You can use this workflow for _e.g._:
 - **functional and taxonomic annotation**
 - **metagenomic binning**
 
-See the [Wiki-pages](https://github.com/NBISweden/nbis-meta/wiki) for 
+See the [Wiki-pages](https://github.com/NBISweden/nbis-meta/wiki) for
 instructions on how to run the workflow.
 
 ## Installation
diff --git a/workflow/envs/edger.yml b/workflow/envs/edger.yml
new file mode 100644
index 00000000..ec95a87f
--- /dev/null
+++ b/workflow/envs/edger.yml
@@ -0,0 +1,9 @@
+name: edger
+channels:
+  - conda-forge
+  - bioconda
+  - r
+  - defaults
+dependencies:
+  - r-base
+  - bioconductor-edger=3.30.0
diff --git a/workflow/envs/normalize.yml b/workflow/envs/metagenomeseq.yml
similarity index 60%
rename from workflow/envs/normalize.yml
rename to workflow/envs/metagenomeseq.yml
index f60900c0..559d88d8 100644
--- a/workflow/envs/normalize.yml
+++ b/workflow/envs/metagenomeseq.yml
@@ -1,7 +1,9 @@
-name: normalize
+name: metagenomeseq
 channels:
+  - conda-forge
   - bioconda
+  - r
   - defaults
 dependencies:
-  - bioconductor-edger
+  - r-base
   - bioconductor-metagenomeseq
diff --git a/workflow/scripts/common.R b/workflow/scripts/common.R
new file mode 100644
index 00000000..16dd914d
--- /dev/null
+++ b/workflow/scripts/common.R
@@ -0,0 +1,27 @@
+
+num_cols <- function(x) {
+    x <- x[, unlist(lapply(x, is.numeric))]
+    x
+}
+
+str_cols <- function(x) {
+    x <- x[, unlist(lapply(x, is.character))]
+    x
+}
+
+process_data <- function(input, output) {
+    # Read the counts
+    x <- read.delim(input, row.names = 1, sep = "\t", header = TRUE)
+    # Remove unclassified
+    if ("Unclassified" %in% row.names(x)){
+        x <- x[row.names(x)!="Unclassified", ]
+    }
+    if (nrow(x)==0) {
+        write.table(x, file = output, quote = FALSE, sep="\t")
+        quit()
+    }
+
+    # Extract only numeric columns
+    x_num <- num_cols(x)
+    x_num
+}
diff --git a/workflow/scripts/edger.R b/workflow/scripts/edger.R
new file mode 100644
index 00000000..ae5d3ed6
--- /dev/null
+++ b/workflow/scripts/edger.R
@@ -0,0 +1,36 @@
+#!/usr/bin/env Rscript
+
+library(edgeR)
+source("common.R")
+
+log <- file(snakemake@log[[1]], open="wt")
+sink(log)
+sink(log, type="message")
+
+method <- snakemake@params$method
+input <- snakemake@input[[1]]
+output <- snakemake@output[[1]]
+
+x_num <- process_data(input, output)
+
+if (method %in% c("TMM", "RLE")) {
+    # Create DGE
+    obj <- DGEList(x_num)
+    # Calculate norm factors
+    obj <- calcNormFactors(obj, method = method)
+    # Calculate cpms
+    norm <- cpm(obj, normalized.lib.sizes = TRUE)
+} else if (method == "RPKM") {
+    # Extract gene length column
+    gene_length <- x_num$Length
+    x_num <- x_num[, colnames(x_num) != "Length"]
+    obj <- DGEList(x_num)
+    # Calculate norm factors
+    obj <- calcNormFactors(obj, method = "TMM")
+    # Calculate RPKM
+    norm <- rpkm(obj, normalized.lib.sizes = TRUE, gene.length = gene_length)
+}
+
+norm <- cbind(str_cols(x), norm)
+
+write.table(x = norm, file = output, quote = FALSE, sep="\t")
diff --git a/workflow/scripts/metagenomeseq.R b/workflow/scripts/metagenomeseq.R
new file mode 100644
index 00000000..3fbf4732
--- /dev/null
+++ b/workflow/scripts/metagenomeseq.R
@@ -0,0 +1,35 @@
+#!/usr/bin/env Rscript
+
+library(metagenomeSeq)
+source("common.R")
+
+log <- file(snakemake@log[[1]], open="wt")
+sink(log)
+sink(log, type="message")
+
+log <- file(snakemake@log[[1]], open="wt")
+sink(log)
+sink(log, type="message")
+
+method <- snakemake@params$method
+input <- snakemake@input[[1]]
+output <- snakemake@output[[1]]
+
+x_num <- process_data(input, output)
+
+obj <- newMRexperiment(x_num)
+smat = lapply(1:ncol(x_num), function(i) {
+    sort(x_num[which(x_num[, i]>0),i], decreasing = TRUE)
+})
+if(any(sapply(smat,length)==1)) {
+    fh <-file(snakemake@output[[1]])
+    writeLines(c("WARNING: Sample with one or zero features",
+                 "Cumulative Sum Scaling failed for sample"), fh)
+    close(fh)
+    quit()
+}
+norm <- MRcounts(obj, norm = TRUE)
+
+norm <- cbind(str_cols(x), norm)
+
+write.table(x = norm, file = output, quote = FALSE, sep="\t")
diff --git a/workflow/scripts/normalize.R b/workflow/scripts/normalize.R
deleted file mode 100644
index 66bdb489..00000000
--- a/workflow/scripts/normalize.R
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/env Rscript
-
-log <- file(snakemake@log[[1]], open="wt")
-sink(log)
-sink(log, type="message")
-
-num_cols <- function(x) {
-    x <- x[, unlist(lapply(x, is.numeric))]
-    x
-}
-
-str_cols <- function(x) {
-    x <- x[, unlist(lapply(x, is.character))]
-    x
-}
-
-method <- snakemake@params$method
-
-# Read the counts
-x <- read.delim(snakemake@input[[1]], row.names = 1, sep = "\t", header = TRUE)
-# Remove unclassified
-if ("Unclassified" %in% row.names(x)){
-    x <- x[row.names(x)!="Unclassified", ]
-}
-if (nrow(x)==0) {
-    write.table(x, file = snakemake@output[[1]], quote = FALSE, sep="\t")
-    quit()
-}
-
-# Extract only numeric columns
-x_num <- num_cols(x)
-
-if (method %in% c("TMM", "RLE")) {
-    library(edgeR)
-    # Create DGE
-    obj <- DGEList(x_num)
-    # Calculate norm factors
-    obj <- calcNormFactors(obj, method = method)
-    # Calculate cpms
-    norm <- cpm(obj, normalized.lib.sizes = TRUE)
-} else if (method == "RPKM") {
-    library(edgeR)
-    # Extract gene length column
-    gene_length <- x_num$Length
-    x_num <- x_num[, colnames(x_num) != "Length"]
-    obj <- DGEList(x_num)
-    # Calculate norm factors
-    obj <- calcNormFactors(obj, method = "TMM")
-    # Calculate RPKM
-    norm <- rpkm(obj, normalized.lib.sizes = TRUE, gene.length = gene_length)
-} else {
-    library(metagenomeSeq)
-    obj <- newMRexperiment(x_num)
-    smat = lapply(1:ncol(x_num), function(i) {
-	    sort(x_num[which(x_num[, i]>0),i], decreasing = TRUE)
-	})
-    if(any(sapply(smat,length)==1)) {
-        fh <-file(snakemake@output[[1]])
-        writeLines(c("WARNING: Sample with one or zero features",
-                     "Cumulative Sum Scaling failed for sample"), fh)
-        close(fh)
-        quit()
-    }
-    norm <- MRcounts(obj, norm = TRUE)
-}
-
-norm <- cbind(str_cols(x), norm)
-
-write.table(x = norm, file = snakemake@output[[1]], quote = FALSE, sep="\t")

From 557419a60272982eee50c001e577d5e44fa7070a Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 12:26:37 +0200
Subject: [PATCH 46/63] Fix whitespace

---
 workflow/scripts/eggnog-parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/scripts/eggnog-parser.py b/workflow/scripts/eggnog-parser.py
index 292ebbca..1875593f 100644
--- a/workflow/scripts/eggnog-parser.py
+++ b/workflow/scripts/eggnog-parser.py
@@ -335,7 +335,7 @@ def main():
                               help="emapper.py annotation files (Typically *.emapper.annotations)")
     annot_parser.add_argument("outdir",
                               help="Output directory for parsed files")
-    annot_parser.add_argument("--map_go", action="store_true", 
+    annot_parser.add_argument("--map_go", action="store_true",
                               help="Also map GO terms (can take a long time)")
     annot_parser.set_defaults(func=parse)
     # Sum annotations parser

From 3ca01b78a12820594a1df53ca7fceec9ed30d0c2 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 12:28:26 +0200
Subject: [PATCH 47/63] Merge RGI testing with metaspades

---
 .github/workflows/main.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index c97a0034..55089cfd 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -50,12 +50,9 @@ jobs:
       run: |
         snakemake --use-conda -j 4 --configfile .test/config/kraken.yaml -p results/kraken/sample1_1_pe.kreport results/kraken/sample4_1_se.kreport
     - name: Metaspades + RGI
-      run: |
-        snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p assemble
-        rm -r results/assembly results/report examples/data/sample*
-    - name: RGI
       run: |
         snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p annotate
+        rm -r results/assembly results/report examples/data/sample*
     - name: Prepare taxonomy
       run: bash .test/scripts/prep_taxonomy.sh
     - name: Taxonomy

From d7586660b2cf45d2a3bdb22129fb2b2e4573fc8f Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 12:28:49 +0200
Subject: [PATCH 48/63] Fix whitespace

---
 .test/README.md | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/.test/README.md b/.test/README.md
index 862d0942..01ce1483 100644
--- a/.test/README.md
+++ b/.test/README.md
@@ -1,24 +1,23 @@
 # Testing
 
-This directory contains config files, resources and scripts used to test the 
+This directory contains config files, resources and scripts used to test the
 workflow.
 
 **config/**
 
-Files under `config/` are used by the different test steps of the github 
+Files under `config/` are used by the different test steps of the github
 actions testing.
 
 **data/**
 
 The fasta file at `data/uniref100.fasta` was created by:
 
-1. searching [uniprot](https://uniprot.org) for the 5 taxids used to generate 
+1. searching [uniprot](https://uniprot.org) for the 5 taxids used to generate
 the [synthetic metagenome](https://zenodo.org/record/3737112#.XsUQncZ8LOQ) that
 this workflow uses for testing. This resulted in 8,122 identified sequences.
 2. mapping the sequences to their UniRef100 id via the Retrieve/ID mapping tool
 at uniprot, followed by downloading of the reference sequences.
 3. subsampling 100 sequences from the downloaded fastafile using `seqtk
 
-During testing the fasta file is used to build and query a diamond database 
-using `tango`. 
-
+During testing the fasta file is used to build and query a diamond database
+using `tango`.

From 017ce3b1afce8c4d33311bde3bba238735387511 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 12:29:54 +0200
Subject: [PATCH 49/63] Fix whitespace

---
 slurm/slurm_utils.py                         | 4 ++--
 workflow/.cfg/multiqc_preprocess_config.yaml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/slurm/slurm_utils.py b/slurm/slurm_utils.py
index a420d1c3..a8b5e47b 100644
--- a/slurm/slurm_utils.py
+++ b/slurm/slurm_utils.py
@@ -59,7 +59,7 @@ def format(_pattern, _quote_all=False, **kwargs):
 #  adapted from Job.format_wildcards in snakemake.jobs
 def format_wildcards(string, job_properties):
     """ Format a string with variables from the job. """
-    
+
     class Job(object):
         def __init__(self, job_properties):
             for key in job_properties:
@@ -100,7 +100,7 @@ def format_values(dictionary, job_properties):
                 )
                 raise WorkflowError(msg, e)
     return formatted
-    
+
 def convert_job_properties(job_properties, resource_mapping={}):
     options = {}
     resources = job_properties.get("resources", {})
diff --git a/workflow/.cfg/multiqc_preprocess_config.yaml b/workflow/.cfg/multiqc_preprocess_config.yaml
index c1bdb4d9..3deab64b 100644
--- a/workflow/.cfg/multiqc_preprocess_config.yaml
+++ b/workflow/.cfg/multiqc_preprocess_config.yaml
@@ -9,4 +9,4 @@ extra_fn_clean_exts:
     - '_PHIX'
     - '.sortmerna'
     - '_R1'
-    - '_R2'
\ No newline at end of file
+    - '_R2'

From 77a932a190c1e49a739da3587767d5197f152f2f Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 12:37:29 +0200
Subject: [PATCH 50/63] Update path to common.R

---
 workflow/scripts/edger.R         | 5 ++---
 workflow/scripts/metagenomeseq.R | 8 ++------
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/workflow/scripts/edger.R b/workflow/scripts/edger.R
index ae5d3ed6..5b09a870 100644
--- a/workflow/scripts/edger.R
+++ b/workflow/scripts/edger.R
@@ -1,12 +1,11 @@
 #!/usr/bin/env Rscript
 
-library(edgeR)
-source("common.R")
-
 log <- file(snakemake@log[[1]], open="wt")
 sink(log)
 sink(log, type="message")
+source("workflow/scripts/common.R")
 
+library(edgeR)
 method <- snakemake@params$method
 input <- snakemake@input[[1]]
 output <- snakemake@output[[1]]
diff --git a/workflow/scripts/metagenomeseq.R b/workflow/scripts/metagenomeseq.R
index 3fbf4732..fbcfe955 100644
--- a/workflow/scripts/metagenomeseq.R
+++ b/workflow/scripts/metagenomeseq.R
@@ -1,15 +1,11 @@
 #!/usr/bin/env Rscript
 
-library(metagenomeSeq)
-source("common.R")
-
 log <- file(snakemake@log[[1]], open="wt")
 sink(log)
 sink(log, type="message")
 
-log <- file(snakemake@log[[1]], open="wt")
-sink(log)
-sink(log, type="message")
+library(metagenomeSeq)
+source("workflow/scripts/common.R")
 
 method <- snakemake@params$method
 input <- snakemake@input[[1]]

From 7998fc597deb543021feaaf50c226a30d5904e15 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 12:54:53 +0200
Subject: [PATCH 51/63] Split metagenomeseq and edger into separate scripts to
 help with conda installation

---
 workflow/rules/common.smk         |  2 +-
 workflow/rules/quantification.smk | 30 +++++++++++++++++++++++-------
 workflow/scripts/common.R         |  9 +--------
 workflow/scripts/edger.R          |  9 ++++++++-
 workflow/scripts/metagenomeseq.R  |  8 +++++++-
 5 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 5ec04ede..7012e079 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -37,7 +37,7 @@ wildcard_constraints:
     group="\w+",
     l="\d+",
     counts_type="(counts|rpkm)",
-    norm_method="(TMM|CSS|RLE)"
+    norm_method="(TMM|RLE)"
 
 from scripts.common import check_uppmax, check_annotation, check_assembly, check_classifiers
 
diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk
index 95bc40f8..aa656363 100644
--- a/workflow/rules/quantification.smk
+++ b/workflow/rules/quantification.smk
@@ -5,7 +5,8 @@ localrules:
     aggregate_featurecount,
     rpkm,
     count_features,
-    normalize_features,
+    edger_normalize_features,
+    css_normalize_features,
     sum_to_taxa
 
 ##### quantify master rule #####
@@ -137,9 +138,9 @@ rule rpkm:
     params:
         method = "RPKM"
     conda:
-        "../envs/normalize.yml"
+        "../envs/edger.yml"
     script:
-        "../scripts/normalize.R"
+        "../scripts/edger.R"
 
 rule count_features:
     """
@@ -153,9 +154,9 @@ rule count_features:
     script:
         "../scripts/quantification_utils.py"
 
-rule normalize_features:
+rule edger_normalize_features:
     """
-    Normalizes counts of features using TMM, REL or CSS
+    Normalizes counts of features using TMM and REL
     """
     input:
         opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.counts.tsv")
@@ -166,9 +167,24 @@ rule normalize_features:
     params:
         method = "{norm_method}"
     conda:
-        "../envs/normalize.yml"
+        "../envs/edger.yml"
     script:
-        "../scripts/normalize.R"
+        "../scripts/edger.R"
+
+rule css_normalize_features:
+    """
+    Normalizes counts of features using CSS from metagenomeSeq
+    """
+    input:
+        opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.counts.tsv")
+    output:
+        opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.CSS.tsv")
+    log:
+        opj(config["paths"]["results"], "annotation", "{assembly}", "{db}.parsed.CSS.log")
+    conda:
+        "../envs/metagenomeseq.yml"
+    script:
+        "../scripts/metagenomeseq.R"
 
 rule sum_to_taxa:
     """
diff --git a/workflow/scripts/common.R b/workflow/scripts/common.R
index 16dd914d..a08c3698 100644
--- a/workflow/scripts/common.R
+++ b/workflow/scripts/common.R
@@ -9,18 +9,11 @@ str_cols <- function(x) {
     x
 }
 
-process_data <- function(input, output) {
-    # Read the counts
-    x <- read.delim(input, row.names = 1, sep = "\t", header = TRUE)
-    # Remove unclassified
-    if ("Unclassified" %in% row.names(x)){
-        x <- x[row.names(x)!="Unclassified", ]
-    }
+process_data <- function(x, output) {
     if (nrow(x)==0) {
         write.table(x, file = output, quote = FALSE, sep="\t")
         quit()
     }
-
     # Extract only numeric columns
     x_num <- num_cols(x)
     x_num
diff --git a/workflow/scripts/edger.R b/workflow/scripts/edger.R
index 5b09a870..5c9ac1df 100644
--- a/workflow/scripts/edger.R
+++ b/workflow/scripts/edger.R
@@ -10,7 +10,14 @@ method <- snakemake@params$method
 input <- snakemake@input[[1]]
 output <- snakemake@output[[1]]
 
-x_num <- process_data(input, output)
+# Read the counts
+x <- read.delim(input, row.names = 1, sep = "\t", header = TRUE)
+# Remove unclassified
+if ("Unclassified" %in% row.names(x)){
+    x <- x[row.names(x)!="Unclassified", ]
+}
+
+x_num <- process_data(x, output)
 
 if (method %in% c("TMM", "RLE")) {
     # Create DGE
diff --git a/workflow/scripts/metagenomeseq.R b/workflow/scripts/metagenomeseq.R
index fbcfe955..1fab5f04 100644
--- a/workflow/scripts/metagenomeseq.R
+++ b/workflow/scripts/metagenomeseq.R
@@ -10,8 +10,14 @@ source("workflow/scripts/common.R")
 method <- snakemake@params$method
 input <- snakemake@input[[1]]
 output <- snakemake@output[[1]]
+# Read the counts
+x <- read.delim(input, row.names = 1, sep = "\t", header = TRUE)
+# Remove unclassified
+if ("Unclassified" %in% row.names(x)){
+    x <- x[row.names(x)!="Unclassified", ]
+}
 
-x_num <- process_data(input, output)
+x_num <- process_data(x, output)
 
 obj <- newMRexperiment(x_num)
 smat = lapply(1:ncol(x_num), function(i) {

From 2777be7a33f590fd596c46e2ab7c0585cb6227c2 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 14:22:58 +0200
Subject: [PATCH 52/63] Remove trailing '*' in protein fasta

---
 workflow/rules/annotation.smk | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index 67c5bd32..c8f1b9f6 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -411,7 +411,9 @@ rule rgi:
         opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.log")
     params:
         out=opj(config["paths"]["results"], "annotation", "{assembly}", "rgi.out"),
-        settings="-a diamond --local --clean --input_type protein"
+        settings="-a diamond --local --clean --input_type protein",
+        faa=opj(config["paths"]["temp"], "{assembly}.rgi", "final_contig.faa"),
+        tmpdir=opj(config["paths"]["temp"], "{assembly}.rgi")
     shadow: "minimal"
     conda:
         "../envs/rgi.yml"
@@ -420,9 +422,12 @@ rule rgi:
         runtime=lambda wildcards, attempt: attempt**2*60
     shell:
         """
+        mkdir -p {params.tmpdir}
         rgi load -i {input.db} --local > {log} 2>&1
-        rgi main -i {input.faa} -o {params.out} \
+        sed 's/*//g' {input.faa} > {params.faa}
+        rgi main -i {params.faa} -o {params.out} \
             -n {threads} {params.settings} >>{log} 2>>{log}
+        rm -r {params.tmpdir}
         """
 
 rule parse_rgi:

From 9095a828b02e4b076cc74176c393499eb72295ed Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 14:59:44 +0200
Subject: [PATCH 53/63] Add testing for eggnog-mapper as well

---
 .github/workflows/main.yml   | 3 ++-
 .test/config/metaspades.yaml | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 55089cfd..6c8dfd4c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -49,8 +49,9 @@ jobs:
     - name: Kraken
       run: |
         snakemake --use-conda -j 4 --configfile .test/config/kraken.yaml -p results/kraken/sample1_1_pe.kreport results/kraken/sample4_1_se.kreport
-    - name: Metaspades + RGI
+    - name: Metaspades + annotation
       run: |
+        bash .test/scripts/prep_eggnog.sh
         snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p annotate
         rm -r results/assembly results/report examples/data/sample*
     - name: Prepare taxonomy
diff --git a/.test/config/metaspades.yaml b/.test/config/metaspades.yaml
index e619bec8..f93d9174 100644
--- a/.test/config/metaspades.yaml
+++ b/.test/config/metaspades.yaml
@@ -119,7 +119,7 @@ annotation:
   # run infernal for rRNA identification?
   infernal: False
   # run eggnog-mapper to infer KEGG orthologs, pathways and modules?
-  eggnog: False
+  eggnog: True
   # run PFAM-scan to infer protein families from PFAM?
   pfam: False
   # run Resistance gene identifier?

From a3fb03e148b8fb61e1a8b367aee7e42ae3b87d63 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Tue, 13 Oct 2020 15:00:22 +0200
Subject: [PATCH 54/63] Add script for prepping test db for eggnog-mapper

---
 .test/scripts/prep_eggnog.sh | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100755 .test/scripts/prep_eggnog.sh

diff --git a/.test/scripts/prep_eggnog.sh b/.test/scripts/prep_eggnog.sh
new file mode 100755
index 00000000..6ffac7a5
--- /dev/null
+++ b/.test/scripts/prep_eggnog.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+base="https://github.com/eggnogdb/eggnog-mapper/blob/master/tests/fixtures"
+
+mkdir -p resources/eggnog-mapper
+curl -L -o resources/eggnog-mapper/eggnog.db "${base}/eggnog.db?raw=true"
+curl -L -o resources/eggnog-mapper/eggnog_proteins.dmnd "${base}/eggnog_proteins.dmnd?raw=true"
+touch resources/eggnog-mapper/eggnog.version

From fefc773ba00d35f2f552527e6384cb4345c9b241 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Wed, 14 Oct 2020 10:59:21 +0200
Subject: [PATCH 55/63] Add TMPDIR to env for Github Actions

---
 .github/workflows/main.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 6c8dfd4c..a68c2990 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -13,6 +13,8 @@ on:
       - 'LICENSE'
     branches:
       - master
+env:
+  TMPDIR: /tmp
 
 jobs:
   test:

From bb2f532e521a157b73afae12ef9e85b97a3a2cbe Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Wed, 14 Oct 2020 11:29:56 +0200
Subject: [PATCH 56/63] Set phiX filtering to false for testing

---
 .test/config/metaspades.yaml | 2 +-
 .test/config/preprocess.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.test/config/metaspades.yaml b/.test/config/metaspades.yaml
index f93d9174..aa5444ad 100644
--- a/.test/config/metaspades.yaml
+++ b/.test/config/metaspades.yaml
@@ -20,7 +20,7 @@ preprocessing:
   # run fastuniq (removes duplicates from paired-end samples)
   fastuniq: True
   # map reads agains the phix genome and keep only reads that do not map concordantly
-  phix_filter: True
+  phix_filter: False
   # run SortMeRNA to identify (and filter) rRNA sequences
   sortmerna: True
 
diff --git a/.test/config/preprocess.yaml b/.test/config/preprocess.yaml
index a2a2e6b7..4d56d3d9 100644
--- a/.test/config/preprocess.yaml
+++ b/.test/config/preprocess.yaml
@@ -20,7 +20,7 @@ preprocessing:
   # run fastuniq (removes duplicates from paired-end samples)
   fastuniq: True
   # map reads agains the phix genome and keep only reads that do not map concordantly
-  phix_filter: True
+  phix_filter: False
   # run SortMeRNA to identify (and filter) rRNA sequences
   sortmerna: True
 

From f2315c9efc04e0b17264429c25b15daa7932e5a0 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Wed, 14 Oct 2020 14:12:49 +0200
Subject: [PATCH 57/63] Remove message from sample linking

---
 workflow/rules/preprocessing.smk | 1 -
 1 file changed, 1 deletion(-)

diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk
index 32ab1110..b06645e6 100644
--- a/workflow/rules/preprocessing.smk
+++ b/workflow/rules/preprocessing.smk
@@ -27,7 +27,6 @@ rule link_files:
         lambda wildcards: samples[wildcards.sample][wildcards.unit][wildcards.pair]
     output:
         opj(config["paths"]["results"], "intermediate", "preprocess", "{sample}_{unit}_{pair}.fastq.gz")
-    message: "Linking {wildcards.sample}_{wildcards.unit}_{wildcards.pair}.fastq.gz"
     run:
         link(input[0], output[0])
 

From 721f7375e257456916e3f19e0b6a5ffc51148a23 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Wed, 14 Oct 2020 16:44:10 +0200
Subject: [PATCH 58/63] Update config files for testing

---
 .github/workflows/main.yml   | 25 +++++++++++++++++--------
 .test/config/metabat.yaml    |  2 +-
 .test/config/preprocess.yaml |  2 +-
 .test/config/taxonomy.yaml   | 10 +++++-----
 4 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index a68c2990..9dea8b2c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -27,45 +27,50 @@ jobs:
     - uses: s-weigand/setup-conda@v1
     - name: Install the conda environment
       run: conda env update -n base -f environment.yml
+    # First dry-run on full workflow
     - name: Dry run
       run: snakemake --use-conda -j 4 --configfile .test/config/dry-run.yaml -n
+    # Test cutadapt (10k reads/sample)
     - name: Cutadapt
       run: |
         snakemake --use-conda -j 4 --configfile .test/config/cutadapt.yaml -p --notemp qc
         snakemake -j 1 --configfile .test/config/cutadapt.yaml --report report.html qc
+    # Upload sample report
     - name: Upload cutadapt report
       uses: actions/upload-artifact@v1
       with:
         name: ${{ runner.os }}-cutadapt.html
         path: report.html
+    # Test all other preprocessing software except cutadapt (10k reads/sample)
     - name: Preprocess
       run: |
         rm -rf results report.html
         snakemake --use-conda -j 4 --configfile .test/config/preprocess.yaml -p qc
         snakemake --use-conda -j 4 --configfile .test/config/preprocess.yaml -p --report report.html qc
+    # Upload samples report
     - name: Upload samples report
       uses: actions/upload-artifact@v1
       with:
         name: ${{ runner.os }}-samples_report.html
         path: report.html
+    # Run kraken for sample1 (paired-end) and sample4 (single-end) (10k reads/sample)
     - name: Kraken
       run: |
         snakemake --use-conda -j 4 --configfile .test/config/kraken.yaml -p results/kraken/sample1_1_pe.kreport results/kraken/sample4_1_se.kreport
-    - name: Metaspades + annotation
+        rm -rf results examples/data/sample*
+    # Test annotations + normalizations with metaspades (100k reads/sample)
+    - name: Annotation
       run: |
         bash .test/scripts/prep_eggnog.sh
-        snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p annotate
+        bash .test/scripts/prep_taxonomy.sh
+        snakemake --use-conda -j 4 --configfile .test/config/annotate.yaml -p annotate
         rm -r results/assembly results/report examples/data/sample*
-    - name: Prepare taxonomy
-      run: bash .test/scripts/prep_taxonomy.sh
-    - name: Taxonomy
-      run: |
-        snakemake --use-conda -j 4 --configfile .test/config/taxonomy.yaml -p taxonomy
-        rm -r results examples/data/sample*
+    # Test Metabat2 with Megahit (200k reads/sample)
     - name: Metabat
       run: |
         snakemake --use-conda -j 4 --configfile .test/config/metabat.yaml -p bin assemble
         snakemake --use-conda -j 4 --configfile .test/config/metabat.yaml -p --report report.html bin assemble
+    # Upload report
     - name: Upload snakemake report
       uses: actions/upload-artifact@v1
       with:
@@ -89,16 +94,20 @@ jobs:
         restore-keys: |
           ${{ runner.os }}-${{ env.cache-name }}
           ${{ runner.os }}-
+    # Run kraken and produce final output reports (linux only) (10k reads/sample)
     - name: Kraken
       run: |
         snakemake --use-conda -j 4 --configfile .test/config/kraken.yaml --notemp -p classify
         rm -r examples/data/sample* results
+    # Run binning including also checkm (200k reads/sample)
     - name: Binning
       run: |
         snakemake --use-conda -j 4 --configfile .test/config/binning.yaml -p assemble bin
+    # Create report for assembly and binning
     - name: Snakemake report
       run: |
         snakemake --use-conda -j 4 --configfile .test/config/binning.yaml -p --report report.html assemble bin
+    # Upload report
     - name: Upload snakemake report
       uses: actions/upload-artifact@v1
       with:
diff --git a/.test/config/metabat.yaml b/.test/config/metabat.yaml
index 03f0e3d8..14196c45 100644
--- a/.test/config/metabat.yaml
+++ b/.test/config/metabat.yaml
@@ -16,7 +16,7 @@ preprocessing:
   # trim reads with trimmomatic? (quality and adapter trimming)
   trimmomatic: False
   # trim reads with cutadapt? (runs instead of trimmomatic, no quality trimming)
-  cutadapt: True
+  cutadapt: False
   # run fastuniq (removes duplicates from paired-end samples)
   fastuniq: False
   # map reads agains the phix genome and keep only reads that do not map concordantly
diff --git a/.test/config/preprocess.yaml b/.test/config/preprocess.yaml
index 4d56d3d9..a2a2e6b7 100644
--- a/.test/config/preprocess.yaml
+++ b/.test/config/preprocess.yaml
@@ -20,7 +20,7 @@ preprocessing:
   # run fastuniq (removes duplicates from paired-end samples)
   fastuniq: True
   # map reads agains the phix genome and keep only reads that do not map concordantly
-  phix_filter: False
+  phix_filter: True
   # run SortMeRNA to identify (and filter) rRNA sequences
   sortmerna: True
 
diff --git a/.test/config/taxonomy.yaml b/.test/config/taxonomy.yaml
index 8f6562cd..2f8942b1 100644
--- a/.test/config/taxonomy.yaml
+++ b/.test/config/taxonomy.yaml
@@ -12,9 +12,9 @@ paths:
 preprocessing:
   # run fastqc on (preprocessed) input?
   # if no preprocessing is done, fastqc will be run on the raw reads
-  fastqc: True
+  fastqc: False
   # trim reads with trimmomatic? (quality and adapter trimming)
-  trimmomatic: True
+  trimmomatic: False
   # trim reads with cutadapt? (runs instead of trimmomatic, no quality trimming)
   cutadapt: False
   # run fastuniq (removes duplicates from paired-end samples)
@@ -87,13 +87,13 @@ sortmerna:
   extra_settings: "--num_alignments 1"
 
 # remove duplicates from bam files?
-remove_duplicates: True
+remove_duplicates: False
 
 assembly:
   # run Megahit assembler?
-  megahit: False
+  megahit: True
   # Use Metaspades instead of Megahit for assembly?
-  metaspades: True
+  metaspades: False
 
 megahit:
   # maximum threads for megahit

From 806f6a677b2c62284ed183c36b372f3cae33eb56 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 15 Oct 2020 09:57:39 +0200
Subject: [PATCH 59/63] Rename test config

---
 .test/config/{metaspades.yaml => annotate.yaml} | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)
 rename .test/config/{metaspades.yaml => annotate.yaml} (98%)

diff --git a/.test/config/metaspades.yaml b/.test/config/annotate.yaml
similarity index 98%
rename from .test/config/metaspades.yaml
rename to .test/config/annotate.yaml
index aa5444ad..45643473 100644
--- a/.test/config/metaspades.yaml
+++ b/.test/config/annotate.yaml
@@ -14,15 +14,15 @@ preprocessing:
   # if no preprocessing is done, fastqc will be run on the raw reads
   fastqc: False
   # trim reads with trimmomatic? (quality and adapter trimming)
-  trimmomatic: True
+  trimmomatic: False
   # trim reads with cutadapt? (runs instead of trimmomatic, no quality trimming)
   cutadapt: False
   # run fastuniq (removes duplicates from paired-end samples)
-  fastuniq: True
+  fastuniq: False
   # map reads agains the phix genome and keep only reads that do not map concordantly
   phix_filter: False
   # run SortMeRNA to identify (and filter) rRNA sequences
-  sortmerna: True
+  sortmerna: False
 
 # parameters for trimmomatic
 trimmomatic:
@@ -87,11 +87,11 @@ sortmerna:
   extra_settings: "--num_alignments 1"
 
 # remove duplicates from bam files?
-remove_duplicates: True
+remove_duplicates: False
 
 assembly:
   # run Megahit assembler?
-  megahit: True
+  megahit: False
   # Use Metaspades instead of Megahit for assembly?
   metaspades: True
 
@@ -125,7 +125,7 @@ annotation:
   # run Resistance gene identifier?
   rgi: True
   # run taxonomic annotation of assembled contigs (using tango + sourmash)?
-  taxonomy: False
+  taxonomy: True
 
 # params for taxonomic annotation of contigs/orfs
 taxonomy:
@@ -262,4 +262,4 @@ metaphlan:
 # this sets the number of reads to generate for example input files
 # the files are generated based on the config/samples.tsv file and stored
 # under examples/data
-example_dataset_size: 10000
+example_dataset_size: 100000

From bd85465beb9b5849549fb366e9656d605c9c4332 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 15 Oct 2020 10:54:51 +0200
Subject: [PATCH 60/63] Minor update to github workflow

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9dea8b2c..4780c750 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -64,7 +64,7 @@ jobs:
         bash .test/scripts/prep_eggnog.sh
         bash .test/scripts/prep_taxonomy.sh
         snakemake --use-conda -j 4 --configfile .test/config/annotate.yaml -p annotate
-        rm -r results/assembly results/report examples/data/sample*
+        rm -r results/assembly examples/data/sample*
     # Test Metabat2 with Megahit (200k reads/sample)
     - name: Metabat
       run: |

From d9ba7cf5c14db0a02ecf0c3253e28570abd02cee Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 15 Oct 2020 12:16:47 +0200
Subject: [PATCH 61/63] Set markduplicates to True for annotation testing

---
 .test/config/annotate.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.test/config/annotate.yaml b/.test/config/annotate.yaml
index 45643473..0418d8b3 100644
--- a/.test/config/annotate.yaml
+++ b/.test/config/annotate.yaml
@@ -87,7 +87,7 @@ sortmerna:
   extra_settings: "--num_alignments 1"
 
 # remove duplicates from bam files?
-remove_duplicates: False
+remove_duplicates: True
 
 assembly:
   # run Megahit assembler?

From b29407490c836fae6ba4bb9d0205c0bf4abc3c57 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 15 Oct 2020 12:30:19 +0200
Subject: [PATCH 62/63] Run annotation with 2 cores

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 4780c750..ab50bb07 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -63,7 +63,7 @@ jobs:
       run: |
         bash .test/scripts/prep_eggnog.sh
         bash .test/scripts/prep_taxonomy.sh
-        snakemake --use-conda -j 4 --configfile .test/config/annotate.yaml -p annotate
+        snakemake --use-conda -j 2 --configfile .test/config/annotate.yaml -p annotate
         rm -r results/assembly examples/data/sample*
     # Test Metabat2 with Megahit (200k reads/sample)
     - name: Metabat

From eabf34fdffd2e741d0a10cf2f72b1c5b1e60e601 Mon Sep 17 00:00:00 2001
From: johnne <john.sundh@scilifelab.se>
Date: Thu, 15 Oct 2020 12:30:40 +0200
Subject: [PATCH 63/63] Don't run markduplicates for metabat testing

---
 .test/config/metabat.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.test/config/metabat.yaml b/.test/config/metabat.yaml
index 14196c45..ecc8c616 100644
--- a/.test/config/metabat.yaml
+++ b/.test/config/metabat.yaml
@@ -87,7 +87,7 @@ sortmerna:
   extra_settings: "--num_alignments 1"
 
 # remove duplicates from bam files?
-remove_duplicates: True
+remove_duplicates: False
 
 assembly:
   # run Megahit assembler?