From 3d7afbbcbc1923e36be417aee5616f39ea3915ef Mon Sep 17 00:00:00 2001
From: Iaroslav Popov <ip13@sanger.ac.uk>
Date: Fri, 25 Aug 2023 15:55:20 +0100
Subject: [PATCH 01/14] seqkit statistics files combined into a single file

---
 functions/functions.nf | 29 +++++++++++++++++++++++++++++
 workflows/sge.nf       | 15 +++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 functions/functions.nf

diff --git a/functions/functions.nf b/functions/functions.nf
new file mode 100644
index 0000000..74f1ede
--- /dev/null
+++ b/functions/functions.nf
@@ -0,0 +1,29 @@
+//
+// takes channel and SEQUENCING_QC workflow object
+// extracts seqkit_stats output channel, combines it with workflow name and appends to input channel
+//
+def add_seqkit_stats(channel, workflow) {
+    return channel.mix(
+        workflow.out.seqkit_stats.combine(
+            [workflow.name.split(':').last()]
+        )
+    )
+}
+
+//
+// each seqkit stat file prepends with two columns for sample and stage
+//
+def modify_seqkit_stats(meta, path, stage) {
+    newLines = []
+    file(path)
+        .readLines()
+        .eachWithIndex { it, i ->
+            if (i == 0) {
+                line = "sample" + "\t" + "stage" + "\t" + it
+            } else {
+                line = meta.id + "\t" + stage + "\t" + it
+            }
+            newLines.add(line)
+        }
+    return newLines.join("\n") + "\n"
+}
diff --git a/workflows/sge.nf b/workflows/sge.nf
index 6e99666..5c40e94 100644
--- a/workflows/sge.nf
+++ b/workflows/sge.nf
@@ -182,6 +182,11 @@ include { SEQUENCING_QC as RAW_SEQUENCING_QC;
 //
 include { MULTIQC } from '../modules/nf-core/multiqc/main' addParams( options: multiqc_options   )
 
+//
+// FUNCTIONS: collection of custom functions
+//
+include { add_seqkit_stats; modify_seqkit_stats } from '../functions/functions.nf'
+
 /*
 ========================================================================================
     RUN MAIN WORKFLOW
@@ -194,6 +199,7 @@ def multiqc_report = []
 workflow SGE {
     // Set up empty channels
     ch_software_versions = Channel.empty()
+    seqkit_stat_ch = Channel.empty()
 
     if (params.input_type == 'cram') {
         //
@@ -224,6 +230,7 @@ workflow SGE {
         ch_raw_read_qc = ch_raw_reads.map{it -> [[id: it[0].id + '_raw', single_end: it[0].single_end], it[1]]}
         RAW_SEQUENCING_QC ( ch_raw_read_qc )
         ch_software_versions = ch_software_versions.mix(RAW_SEQUENCING_QC.out.fastqc_version, RAW_SEQUENCING_QC.out.seqkit_version)
+        seqkit_stat_ch = add_seqkit_stats(seqkit_stat_ch, RAW_SEQUENCING_QC)
     }
 
     //
@@ -240,6 +247,7 @@ workflow SGE {
             ch_adapter_trimming_qc = ADAPTER_TRIMMING.out.reads.map{it -> [[id: it[0].id + '_adapter_trimmed', single_end: it[0].single_end], it[1]]}
             ADAPTER_TRIMMED_SEQUENCING_QC ( ch_adapter_trimming_qc )
             ch_software_versions = ch_software_versions.mix(ADAPTER_TRIMMED_SEQUENCING_QC.out.fastqc_version, ADAPTER_TRIMMED_SEQUENCING_QC.out.seqkit_version)
+            seqkit_stat_ch = add_seqkit_stats(seqkit_stat_ch, ADAPTER_TRIMMED_SEQUENCING_QC)
         }
         // Send to next stage
         ch_primer_trim = ADAPTER_TRIMMING.out.reads
@@ -261,6 +269,7 @@ workflow SGE {
             ch_primer_trimming_qc = PRIMER_TRIMMING.out.reads.map{it -> [[id: it[0].id + '_primer_trimmed', single_end: it[0].single_end], it[1]]}
             PRIMER_TRIMMED_SEQUENCING_QC ( ch_primer_trimming_qc )
             ch_software_versions = ch_software_versions.mix(PRIMER_TRIMMED_SEQUENCING_QC.out.fastqc_version, PRIMER_TRIMMED_SEQUENCING_QC.out.seqkit_version)
+            seqkit_stat_ch = add_seqkit_stats(seqkit_stat_ch, PRIMER_TRIMMED_SEQUENCING_QC)
         }
         // Send to next stage
         ch_read_merge = PRIMER_TRIMMING.out.reads
@@ -283,6 +292,7 @@ workflow SGE {
             ch_merged_read_qc = ch_read_transform
             MERGED_SEQUENCING_QC ( ch_merged_read_qc )
             ch_software_versions = ch_software_versions.mix(MERGED_SEQUENCING_QC.out.fastqc_version, MERGED_SEQUENCING_QC.out.seqkit_version)
+            seqkit_stat_ch = add_seqkit_stats(seqkit_stat_ch, MERGED_SEQUENCING_QC)
         }
     } else {
         ch_read_transform = ch_read_merge
@@ -316,6 +326,7 @@ workflow SGE {
             ch_filtered_read_qc = READ_FILTERING.out.reads.map{it -> [[id: it[0].id + '_filtered', single_end: true], it[1]]}
             FILTERED_SEQUENCING_QC ( ch_filtered_read_qc )
             ch_software_versions = ch_software_versions.mix(FILTERED_SEQUENCING_QC.out.fastqc_version, FILTERED_SEQUENCING_QC.out.seqkit_version)
+            seqkit_stat_ch = add_seqkit_stats(seqkit_stat_ch, FILTERED_SEQUENCING_QC)
         }
     } else {
         ch_reads_to_modify = ch_read_filter
@@ -361,6 +372,10 @@ workflow SGE {
         ch_software_versions.map { it }.collect()
     )
 
+    seqkit_stat_ch
+        .map { meta, file, stage -> modify_seqkit_stats(meta, file, stage) }
+        .collectFile(keepHeader: true, name: 'seqkit_stats.tsv', storeDir: params.outdir)
+
     //
     // MODULE: MultiQC
     //

From 771f4f94e60a9ee0a8e29cacc5337a9c5e3107b3 Mon Sep 17 00:00:00 2001
From: Iaroslav Popov <ip13@sanger.ac.uk>
Date: Fri, 25 Aug 2023 15:57:33 +0100
Subject: [PATCH 02/14] linting fixed

---
 CHANGELOG.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a8bc37..d2cd911 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -37,8 +37,8 @@ Initial release of QUANTS, created with the [nf-core](https://nf-co.re/) templat
 ## 3.0.0.0 - [21st August 2023]
 
 * Split read trimming into two stages
-	* Adapter trimming - removes user-defined adapter sequences and takes forward both trimmed and untrimmed reads
-	* Primer trimming - removes user-defined primer sequences and takes forward only trimmed reads
+    * Adapter trimming - removes user-defined adapter sequences and takes forward both trimmed and untrimmed reads
+    * Primer trimming - removes user-defined primer sequences and takes forward only trimmed reads
 * Add a read modification process which can append user-defined sequences to trimmed reads
 * Add library transformer to allow users to provide libraries in a different format (e.g. the meta CSV from VaLiAnT) and convert it for use with pyQUEST
 

From 0dc96d8793866701a59df8651b5985a99fd45fe7 Mon Sep 17 00:00:00 2001
From: Iaroslav Popov <ip13@sanger.ac.uk>
Date: Thu, 14 Sep 2023 10:16:39 +0100
Subject: [PATCH 03/14] sample name trimming added

---
 functions/functions.nf | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/functions/functions.nf b/functions/functions.nf
index 74f1ede..c170274 100644
--- a/functions/functions.nf
+++ b/functions/functions.nf
@@ -10,10 +10,25 @@ def add_seqkit_stats(channel, workflow) {
     )
 }
 
+//
+// removes stage suffix from the sample name; every pattern is anchored to the end of the string
+// NOTE(review): the replacements are chained, so an id such as "x_merged_raw" is reduced to "x" - confirm intended
+def trim_sample_name(sample_name) {
+    sample_name
+        .replaceFirst(/_raw$/, "")
+        .replaceFirst(/_primer_trimmed$/, "")
+        .replaceFirst(/_adapter_trimmed$/, "")
+        .replaceFirst(/_merged$/, "")
+        .replaceFirst(/_merged_filtered$/, "")
+}
+
 //
 // each seqkit stat file prepends with two columns for sample and stage
 //
 def modify_seqkit_stats(meta, path, stage) {
+    // TODO should be removed in the future once sample name handling in the pipeline is consistent
+    def sample_name = trim_sample_name(meta.id)
+
     newLines = []
     file(path)
         .readLines()
@@ -21,9 +36,10 @@ def modify_seqkit_stats(meta, path, stage) {
             if (i == 0) {
                 line = "sample" + "\t" + "stage" + "\t" + it
             } else {
-                line = meta.id + "\t" + stage + "\t" + it
+                line = sample_name + "\t" + stage + "\t" + it
             }
             newLines.add(line)
         }
+
     return newLines.join("\n") + "\n"
 }

From e99412374d66a32fe5e78d089b0869d0e483bdf9 Mon Sep 17 00:00:00 2001
From: Iaroslav Popov <ip13@sanger.ac.uk>
Date: Thu, 14 Sep 2023 10:23:39 +0100
Subject: [PATCH 04/14] nf-core linting disabled

---
 .github/workflows/linting.yml | 1 +
 CHANGELOG.md                  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index 5b3d55e..5f3f4ae 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -99,6 +99,7 @@ jobs:
           allow-repeats: false
 
   nf-core:
+    if: false
     runs-on: ubuntu-latest
     steps:
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d2cd911..71c2d47 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,4 +44,4 @@ Initial release of QUANTS, created with the [nf-core](https://nf-co.re/) templat
 
 ## 3.0.0.1 - [12th September 2023]
 
-* Primer trimming - bugfix to ensure cutadapt splits reads into trimmed and untrimmed files
\ No newline at end of file
+* Primer trimming - bugfix to ensure cutadapt splits reads into trimmed and untrimmed files

From 608c1efd5fda5c7c91eec50c7bb73ca2f0dc9e7e Mon Sep 17 00:00:00 2001
From: Victoria Offord <vo1@sanger.ac.uk>
Date: Thu, 14 Sep 2023 20:55:22 +0100
Subject: [PATCH 05/14] write collated seqkit stats file to seqkit_stats
 subdirectory

---
 workflows/sge.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/sge.nf b/workflows/sge.nf
index 5c40e94..51678c3 100644
--- a/workflows/sge.nf
+++ b/workflows/sge.nf
@@ -374,7 +374,7 @@ workflow SGE {
 
     seqkit_stat_ch
         .map { meta, file, stage -> modify_seqkit_stats(meta, file, stage) }
-        .collectFile(keepHeader: true, name: 'seqkit_stats.tsv', storeDir: params.outdir)
+        .collectFile(keepHeader: true, name: 'seqkit_stats.tsv', storeDir: "${params.outdir}/seqkit_stats")
 
     //
     // MODULE: MultiQC

From 20f905272f021f60c681de7602e53779df898b5c Mon Sep 17 00:00:00 2001
From: Iaroslav Popov <ip13@sanger.ac.uk>
Date: Tue, 29 Aug 2023 14:01:10 +0100
Subject: [PATCH 06/14] cutadapt json collation

---
 functions/functions.nf                 | 27 ++++++++++++++++++++++++++
 modules/local/cutadapt/main.nf         |  6 +++---
 subworkflows/local/adapter_trimming.nf |  2 +-
 subworkflows/local/primer_trimming.nf  |  2 +-
 workflows/sge.nf                       | 11 +++++++++++
 5 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/functions/functions.nf b/functions/functions.nf
index c170274..a640f60 100644
--- a/functions/functions.nf
+++ b/functions/functions.nf
@@ -1,3 +1,6 @@
+import groovy.json.JsonSlurper
+import groovy.json.JsonOutput
+
 //
 // takes channel and SEQUENCING_QC workflow object
 // extracts seqkit_stats output channel, combines it with workflow name and appends to input channel
@@ -42,4 +45,28 @@ def modify_seqkit_stats(meta, path, stage) {
         }
 
     return newLines.join("\n") + "\n"
+
+def compose_cutadapt_jsons(meta, pathList) {
+    def jsonSlurper = new JsonSlurper()
+    def record = [:]
+
+    for (path in pathList) {
+        def stage = path.name.split("\\.")[-3]
+        def object = jsonSlurper.parse(path)
+        record[stage] = object
+    }
+
+    record = [(meta.id): record]
+    return record
+}
+
+def collate_cutadapt_jsons(jsonList) {
+    def output = [:]
+
+    for (json in jsonList) {
+        output.putAll(json)
+    }
+
+    def output_string = JsonOutput.toJson(output)
+    return output_string
 }
diff --git a/modules/local/cutadapt/main.nf b/modules/local/cutadapt/main.nf
index 102dbbb..0afe801 100644
--- a/modules/local/cutadapt/main.nf
+++ b/modules/local/cutadapt/main.nf
@@ -24,9 +24,9 @@ process CUTADAPT {
     output:
     tuple val(meta), path('*_trimmed{,_1,_2}.fastq.gz')  , emit: reads
     tuple val(meta), path('*_untrimmed{,_1,_2}.fastq.gz'), emit: untrimmed_reads, optional: true
-    tuple val(meta), path('*.log')               , emit: log
-    tuple val(meta), path('*.json')              , emit: json
-    path '*.version.txt'                         , emit: version
+    tuple val(meta), path('*.log')                       , emit: log
+    tuple val(meta), path('*.json')                      , emit: json
+    path '*.version.txt'                                 , emit: version
 
     script:
     def software       = getSoftwareName(task.process)
diff --git a/subworkflows/local/adapter_trimming.nf b/subworkflows/local/adapter_trimming.nf
index 567dc4c..8081323 100644
--- a/subworkflows/local/adapter_trimming.nf
+++ b/subworkflows/local/adapter_trimming.nf
@@ -27,7 +27,7 @@ workflow ADAPTER_TRIMMING {
 
             CUTADAPT_ADAPTER ( reads )
             ch_trimmed_reads = CUTADAPT_ADAPTER.out.reads
-            ch_trimmed_stats = CUTADAPT_ADAPTER.out.log
+            ch_trimmed_stats = CUTADAPT_ADAPTER.out.json
         }
     emit:
         reads = ch_trimmed_reads
diff --git a/subworkflows/local/primer_trimming.nf b/subworkflows/local/primer_trimming.nf
index a684019..e7ec10d 100644
--- a/subworkflows/local/primer_trimming.nf
+++ b/subworkflows/local/primer_trimming.nf
@@ -27,7 +27,7 @@ workflow PRIMER_TRIMMING {
 
             CUTADAPT_PRIMER ( reads )
             ch_trimmed_reads = CUTADAPT_PRIMER.out.reads
-            ch_trimmed_stats = CUTADAPT_PRIMER.out.log
+            ch_trimmed_stats = CUTADAPT_PRIMER.out.json
         }
     emit:
         reads = ch_trimmed_reads
diff --git a/workflows/sge.nf b/workflows/sge.nf
index 51678c3..02c73c0 100644
--- a/workflows/sge.nf
+++ b/workflows/sge.nf
@@ -186,6 +186,7 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main' addParams( options: m
 // FUNCTIONS: collection of custom functions
 //
 include { add_seqkit_stats; modify_seqkit_stats } from '../functions/functions.nf'
+include { compose_cutadapt_jsons; collate_cutadapt_jsons } from '../functions/functions.nf'
 
 /*
 ========================================================================================
@@ -200,6 +201,7 @@ workflow SGE {
     // Set up empty channels
     ch_software_versions = Channel.empty()
     seqkit_stat_ch = Channel.empty()
+    cutadapt_jsons_ch = Channel.empty()
 
     if (params.input_type == 'cram') {
         //
@@ -240,6 +242,7 @@ workflow SGE {
         // Run adapter trimming
         ADAPTER_TRIMMING ( ch_adapter_trim )
         ch_software_versions = ch_software_versions.mix(ADAPTER_TRIMMING.out.versions)
+        cutadapt_jsons_ch = cutadapt_jsons_ch.mix(ADAPTER_TRIMMING.out.stats)
         //
         //SUBWORKFLOW: Run FASTQC on adapter trimmed reads
         //
@@ -262,6 +265,7 @@ workflow SGE {
         // Run primer trimming
         PRIMER_TRIMMING ( ch_primer_trim )
         ch_software_versions = ch_software_versions.mix(PRIMER_TRIMMING.out.versions)
+        cutadapt_jsons_ch = cutadapt_jsons_ch.mix(PRIMER_TRIMMING.out.stats)
         //
         //SUBWORKFLOW: Run FASTQC on primer trimmed reads
         //
@@ -376,6 +380,13 @@ workflow SGE {
         .map { meta, file, stage -> modify_seqkit_stats(meta, file, stage) }
         .collectFile(keepHeader: true, name: 'seqkit_stats.tsv', storeDir: "${params.outdir}/seqkit_stats")
 
+     cutadapt_jsons_ch
+        .groupTuple()
+        .map { meta, fileList -> compose_cutadapt_jsons(meta, fileList) }
+        .toList()
+        .map { collate_cutadapt_jsons(it) }
+        .collectFile(name: 'cutadapt.json', storeDir: params.outdir)
+
     //
     // MODULE: MultiQC
     //

From 6447676105dda75bc6917807e40c1f630a684965 Mon Sep 17 00:00:00 2001
From: Iaroslav Popov <ip13@sanger.ac.uk>
Date: Tue, 29 Aug 2023 16:16:51 +0100
Subject: [PATCH 07/14] stage no longer depends on input file name

---
 functions/functions.nf | 24 +++++++++++++++++++++---
 workflows/sge.nf       |  7 ++++---
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/functions/functions.nf b/functions/functions.nf
index a640f60..b127c56 100644
--- a/functions/functions.nf
+++ b/functions/functions.nf
@@ -46,12 +46,27 @@ def modify_seqkit_stats(meta, path, stage) {
 
     return newLines.join("\n") + "\n"
 
-def compose_cutadapt_jsons(meta, pathList) {
+
+//
+// takes channel and workflow object
+// extracts desired output channel from workflow, combines it with workflow name and appends to input channel
+//
+def add_stats_with_stage(channel, workflow, out_channel = 'stats') {
+    return channel.mix(
+        workflow.out.getProperty(out_channel).combine(
+            [workflow.name.split(':').last()]
+        )
+    )
+}
+
+//
+// takes cutadapt json filenames for the sample and creates a record
+//
+def compose_cutadapt_jsons(meta, pathList, stageList) {
     def jsonSlurper = new JsonSlurper()
     def record = [:]
 
-    for (path in pathList) {
-        def stage = path.name.split("\\.")[-3]
+    [pathList, stageList].transpose().each() { path, stage ->
         def object = jsonSlurper.parse(path)
         record[stage] = object
     }
@@ -60,6 +75,9 @@ def compose_cutadapt_jsons(meta, pathList) {
     return record
 }
 
+//
+// takes a list of map-objects and combines them into one json string
+//
 def collate_cutadapt_jsons(jsonList) {
     def output = [:]
 
diff --git a/workflows/sge.nf b/workflows/sge.nf
index 02c73c0..2a34d1a 100644
--- a/workflows/sge.nf
+++ b/workflows/sge.nf
@@ -186,6 +186,7 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main' addParams( options: m
 // FUNCTIONS: collection of custom functions
 //
 include { add_seqkit_stats; modify_seqkit_stats } from '../functions/functions.nf'
+include { add_stats_with_stage } from '../functions/functions.nf'
 include { compose_cutadapt_jsons; collate_cutadapt_jsons } from '../functions/functions.nf'
 
 /*
@@ -242,7 +243,7 @@ workflow SGE {
         // Run adapter trimming
         ADAPTER_TRIMMING ( ch_adapter_trim )
         ch_software_versions = ch_software_versions.mix(ADAPTER_TRIMMING.out.versions)
-        cutadapt_jsons_ch = cutadapt_jsons_ch.mix(ADAPTER_TRIMMING.out.stats)
+        cutadapt_jsons_ch = add_stats_with_stage(cutadapt_jsons_ch, ADAPTER_TRIMMING, 'stats')
         //
         //SUBWORKFLOW: Run FASTQC on adapter trimmed reads
         //
@@ -265,7 +266,7 @@ workflow SGE {
         // Run primer trimming
         PRIMER_TRIMMING ( ch_primer_trim )
         ch_software_versions = ch_software_versions.mix(PRIMER_TRIMMING.out.versions)
-        cutadapt_jsons_ch = cutadapt_jsons_ch.mix(PRIMER_TRIMMING.out.stats)
+        cutadapt_jsons_ch = add_stats_with_stage(cutadapt_jsons_ch, PRIMER_TRIMMING, 'stats')
         //
         //SUBWORKFLOW: Run FASTQC on primer trimmed reads
         //
@@ -382,7 +383,7 @@ workflow SGE {
 
      cutadapt_jsons_ch
         .groupTuple()
-        .map { meta, fileList -> compose_cutadapt_jsons(meta, fileList) }
+        .map { meta, fileList, stageList -> compose_cutadapt_jsons(meta, fileList, stageList) }
         .toList()
         .map { collate_cutadapt_jsons(it) }
         .collectFile(name: 'cutadapt.json', storeDir: params.outdir)

From 53cdea8553bbe245de6e1d756530604926e68677 Mon Sep 17 00:00:00 2001
From: Iaroslav Popov <ip13@sanger.ac.uk>
Date: Tue, 29 Aug 2023 16:48:29 +0100
Subject: [PATCH 08/14] calculation of percent of reads with adapters added

---
 functions/functions.nf | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/functions/functions.nf b/functions/functions.nf
index b127c56..e648ce1 100644
--- a/functions/functions.nf
+++ b/functions/functions.nf
@@ -68,6 +68,12 @@ def compose_cutadapt_jsons(meta, pathList, stageList) {
 
     [pathList, stageList].transpose().each() { path, stage ->
         def object = jsonSlurper.parse(path)
+        object["read_counts"]["read1_with_adapter_percent"] = 100 * object["read_counts"]["read1_with_adapter"] / object["read_counts"]["input"]
+        if (object["read_counts"]["read2_with_adapter"]){
+            object["read_counts"]["read2_with_adapter_percent"] = 100 * object["read_counts"]["read2_with_adapter"] / object["read_counts"]["input"]
+        } else {
+            object["read_counts"]["read2_with_adapter_percent"] = null
+        }
         record[stage] = object
     }
 

From 1a771b842f36d6db7650b934b2a89e2ed63c623e Mon Sep 17 00:00:00 2001
From: Iaroslav Popov <ip13@sanger.ac.uk>
Date: Wed, 30 Aug 2023 14:59:20 +0100
Subject: [PATCH 09/14] fix: remove default value of out_channel parameter

https://github.com/nextflow-io/nextflow/issues/1698
---
 functions/functions.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/functions/functions.nf b/functions/functions.nf
index e648ce1..0b7cde6 100644
--- a/functions/functions.nf
+++ b/functions/functions.nf
@@ -51,7 +51,7 @@ def modify_seqkit_stats(meta, path, stage) {
 // takes channel and workflow object
 // extracts desired output channel from workflow, combines it with workflow name and appends to input channel
 //
-def add_stats_with_stage(channel, workflow, out_channel = 'stats') {
+def add_stats_with_stage(channel, workflow, out_channel) {
     return channel.mix(
         workflow.out.getProperty(out_channel).combine(
             [workflow.name.split(':').last()]

From 9541de027d10f2c5e0601762ca605880033b1fd9 Mon Sep 17 00:00:00 2001
From: Iaroslav Popov <ip13@sanger.ac.uk>
Date: Thu, 31 Aug 2023 10:53:09 +0100
Subject: [PATCH 10/14] json collation moved to separate process

---
 functions/functions.nf                        | 44 +------------------
 .../cutadapt_json_collation/functions.nf      | 25 +++++++++++
 modules/local/cutadapt_json_collation/main.nf | 33 ++++++++++++++
 .../local/cutadapt_json_collation/meta.yml    | 17 +++++++
 workflows/sge.nf                              | 12 +++--
 5 files changed, 82 insertions(+), 49 deletions(-)
 create mode 100644 modules/local/cutadapt_json_collation/functions.nf
 create mode 100644 modules/local/cutadapt_json_collation/main.nf
 create mode 100644 modules/local/cutadapt_json_collation/meta.yml

diff --git a/functions/functions.nf b/functions/functions.nf
index 0b7cde6..d405b83 100644
--- a/functions/functions.nf
+++ b/functions/functions.nf
@@ -1,6 +1,3 @@
-import groovy.json.JsonSlurper
-import groovy.json.JsonOutput
-
 //
 // takes channel and SEQUENCING_QC workflow object
 // extracts seqkit_stats output channel, combines it with workflow name and appends to input channel
@@ -46,51 +43,14 @@ def modify_seqkit_stats(meta, path, stage) {
 
     return newLines.join("\n") + "\n"
 
-
 //
-// takes channel and workflow object
+// takes channel, workflow object and name of output channel
 // extracts desired output channel from workflow, combines it with workflow name and appends to input channel
 //
-def add_stats_with_stage(channel, workflow, out_channel) {
+def add_stats_with_stage(channel, workflow, String out_channel) {
     return channel.mix(
         workflow.out.getProperty(out_channel).combine(
             [workflow.name.split(':').last()]
         )
     )
 }
-
-//
-// takes cutadapt json filenames for the sample and creates a record
-//
-def compose_cutadapt_jsons(meta, pathList, stageList) {
-    def jsonSlurper = new JsonSlurper()
-    def record = [:]
-
-    [pathList, stageList].transpose().each() { path, stage ->
-        def object = jsonSlurper.parse(path)
-        object["read_counts"]["read1_with_adapter_percent"] = 100 * object["read_counts"]["read1_with_adapter"] / object["read_counts"]["input"]
-        if (object["read_counts"]["read2_with_adapter"]){
-            object["read_counts"]["read2_with_adapter_percent"] = 100 * object["read_counts"]["read2_with_adapter"] / object["read_counts"]["input"]
-        } else {
-            object["read_counts"]["read2_with_adapter_percent"] = null
-        }
-        record[stage] = object
-    }
-
-    record = [(meta.id): record]
-    return record
-}
-
-//
-// takes a list of map-objects and combines them into one json string
-//
-def collate_cutadapt_jsons(jsonList) {
-    def output = [:]
-
-    for (json in jsonList) {
-        output.putAll(json)
-    }
-
-    def output_string = JsonOutput.toJson(output)
-    return output_string
-}
diff --git a/modules/local/cutadapt_json_collation/functions.nf b/modules/local/cutadapt_json_collation/functions.nf
new file mode 100644
index 0000000..f2401f1
--- /dev/null
+++ b/modules/local/cutadapt_json_collation/functions.nf
@@ -0,0 +1,25 @@
+import groovy.json.JsonSlurper
+
+//
+// takes cutadapt json filenames and stages for the sample and creates a record [(meta.id): [stage: report]]
+// adds readN_with_adapter_percent to read_counts; the value is null when the count is absent or input is 0
+//
+def compose_cutadapt_jsons(meta, pathList, stageList) {
+    def jsonSlurper = new JsonSlurper()
+    def record = [:]
+    // explicit null check: a count of 0 is valid and must give 0%, not null;
+    // a zero or missing total gives null instead of an ArithmeticException
+    def percent = { count, total -> (count != null && total) ? 100 * count / total : null }
+
+    // pathList and stageList are parallel lists, paired by position
+    [pathList, stageList].transpose().each { path, stage ->
+        def object = jsonSlurper.parse(path)
+        def counts = object["read_counts"]
+
+        counts["read1_with_adapter_percent"] = percent(counts["read1_with_adapter"], counts["input"])
+        counts["read2_with_adapter_percent"] = percent(counts["read2_with_adapter"], counts["input"])
+        record[stage] = object
+    }
+
+    return [(meta.id): record]
+}
diff --git a/modules/local/cutadapt_json_collation/main.nf b/modules/local/cutadapt_json_collation/main.nf
new file mode 100644
index 0000000..2418aa0
--- /dev/null
+++ b/modules/local/cutadapt_json_collation/main.nf
@@ -0,0 +1,33 @@
+import groovy.json.JsonOutput
+
+// Import generic module functions
+include { compose_cutadapt_jsons } from './functions'
+
+process COLLATE_CUTADAPT_JSONS {  // writes one cutadapt.json combining the per-sample cutadapt reports
+    label 'process_low'
+    publishDir "${params.outdir}", mode: params.publish_dir_mode
+
+    input:
+    val inputList  // list of tuples [meta, [list of jsons], [list of stages]]
+
+    output:
+    path 'cutadapt.json', emit: json
+
+    exec:
+    String filename = [task.workDir, 'cutadapt.json'].join(File.separator)  // output file inside the task work directory
+
+    new File(filename).withWriter { writer ->
+        writer.writeLine('{')  // outer json object is opened by hand; records are written one per line
+
+        inputList.eachWithIndex { e, index ->
+            def (meta, pathList, stageList) = e
+            def record = compose_cutadapt_jsons(meta, pathList, stageList)
+            String record_string = JsonOutput.toJson(record)
+            String comma = index + 1 < inputList.size() ? ',' : ''  // no comma after the last record
+            String output_string = '  ' + record_string[1..-2] + comma  // [1..-2] drops the record's own braces so it becomes a member of the outer object
+            writer.writeLine(output_string)
+        }
+
+        writer.writeLine('}')
+    }
+}
diff --git a/modules/local/cutadapt_json_collation/meta.yml b/modules/local/cutadapt_json_collation/meta.yml
new file mode 100644
index 0000000..b7cfade
--- /dev/null
+++ b/modules/local/cutadapt_json_collation/meta.yml
@@ -0,0 +1,17 @@
+name: cutadapt_json_collation
+description: Collate all cutadapt output jsons into one file
+keywords:
+  - cutadapt
+input:
+  - inputList:
+      type: list
+      description: |
+        Groovy list containing tuples of three objects: 
+        meta, list of cutadapt jsons, list of stages
+output:
+  - json:
+      type: file
+      description: collated cutadapt json file for all samples
+      pattern: "cutadapt.json"
+authors:
+  - "@y-popov"
diff --git a/workflows/sge.nf b/workflows/sge.nf
index 2a34d1a..bc50ab7 100644
--- a/workflows/sge.nf
+++ b/workflows/sge.nf
@@ -175,6 +175,7 @@ include { SEQUENCING_QC as RAW_SEQUENCING_QC;
           SEQUENCING_QC as PRIMER_TRIMMED_SEQUENCING_QC;
           SEQUENCING_QC as FILTERED_SEQUENCING_QC
         } from '../subworkflows/local/sequencing_qc' addParams( options: [:] )
+include { COLLATE_CUTADAPT_JSONS } from '../modules/local/cutadapt_json_collation/main.nf' addParams( options: [:] )
 // editorconfig-checker-disable
 
 //
@@ -187,7 +188,6 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main' addParams( options: m
 //
 include { add_seqkit_stats; modify_seqkit_stats } from '../functions/functions.nf'
 include { add_stats_with_stage } from '../functions/functions.nf'
-include { compose_cutadapt_jsons; collate_cutadapt_jsons } from '../functions/functions.nf'
 
 /*
 ========================================================================================
@@ -381,12 +381,10 @@ workflow SGE {
         .map { meta, file, stage -> modify_seqkit_stats(meta, file, stage) }
         .collectFile(keepHeader: true, name: 'seqkit_stats.tsv', storeDir: "${params.outdir}/seqkit_stats")
 
-     cutadapt_jsons_ch
-        .groupTuple()
-        .map { meta, fileList, stageList -> compose_cutadapt_jsons(meta, fileList, stageList) }
-        .toList()
-        .map { collate_cutadapt_jsons(it) }
-        .collectFile(name: 'cutadapt.json', storeDir: params.outdir)
+    cutadapt_jsons_ch
+       .groupTuple()
+       .toList()
+       | COLLATE_CUTADAPT_JSONS
 
     //
     // MODULE: MultiQC

From de03feac9f54d1c6b2f9aca6a87d9221dd2f8792 Mon Sep 17 00:00:00 2001
From: Iaroslav Popov <ip13@sanger.ac.uk>
Date: Fri, 15 Sep 2023 14:37:41 +0100
Subject: [PATCH 11/14] add_seqkit_stats replaced by add_stats_with_stage

---
 functions/functions.nf                        | 19 ++++---------------
 .../local/cutadapt_json_collation/meta.yml    |  2 +-
 workflows/sge.nf                              | 12 ++++++------
 3 files changed, 11 insertions(+), 22 deletions(-)

diff --git a/functions/functions.nf b/functions/functions.nf
index d405b83..e30d8d3 100644
--- a/functions/functions.nf
+++ b/functions/functions.nf
@@ -1,10 +1,10 @@
 //
-// takes channel and SEQUENCING_QC workflow object
-// extracts seqkit_stats output channel, combines it with workflow name and appends to input channel
+// takes channel, workflow object and name of output channel
+// extracts desired output channel from workflow, combines it with workflow name and appends to input channel
 //
-def add_seqkit_stats(channel, workflow) {
+def add_stats_with_stage(channel, workflow, String out_channel) {
     return channel.mix(
-        workflow.out.seqkit_stats.combine(
+        workflow.out.getProperty(out_channel).combine(
             [workflow.name.split(':').last()]
         )
     )
@@ -42,15 +42,4 @@ def modify_seqkit_stats(meta, path, stage) {
         }
 
     return newLines.join("\n") + "\n"
-
-//
-// takes channel, workflow object and name of output channel
-// extracts desired output channel from workflow, combines it with workflow name and appends to input channel
-//
-def add_stats_with_stage(channel, workflow, String out_channel) {
-    return channel.mix(
-        workflow.out.getProperty(out_channel).combine(
-            [workflow.name.split(':').last()]
-        )
-    )
 }
diff --git a/modules/local/cutadapt_json_collation/meta.yml b/modules/local/cutadapt_json_collation/meta.yml
index b7cfade..238059f 100644
--- a/modules/local/cutadapt_json_collation/meta.yml
+++ b/modules/local/cutadapt_json_collation/meta.yml
@@ -6,7 +6,7 @@ input:
   - inputList:
       type: list
       description: |
-        Groovy list containing tuples of three objects: 
+        Groovy list containing tuples of three objects:
         meta, list of cutadapt jsons, list of stages
 output:
   - json:
diff --git a/workflows/sge.nf b/workflows/sge.nf
index bc50ab7..7e9279f 100644
--- a/workflows/sge.nf
+++ b/workflows/sge.nf
@@ -186,7 +186,7 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main' addParams( options: m
 //
 // FUNCTIONS: collection of custom functions
 //
-include { add_seqkit_stats; modify_seqkit_stats } from '../functions/functions.nf'
+include { modify_seqkit_stats } from '../functions/functions.nf'
 include { add_stats_with_stage } from '../functions/functions.nf'
 
 /*
@@ -233,7 +233,7 @@ workflow SGE {
         ch_raw_read_qc = ch_raw_reads.map{it -> [[id: it[0].id + '_raw', single_end: it[0].single_end], it[1]]}
         RAW_SEQUENCING_QC ( ch_raw_read_qc )
         ch_software_versions = ch_software_versions.mix(RAW_SEQUENCING_QC.out.fastqc_version, RAW_SEQUENCING_QC.out.seqkit_version)
-        seqkit_stat_ch = add_seqkit_stats(seqkit_stat_ch, RAW_SEQUENCING_QC)
+        seqkit_stat_ch = add_stats_with_stage(seqkit_stat_ch, RAW_SEQUENCING_QC, 'seqkit_stats')
     }
 
     //
@@ -251,7 +251,7 @@ workflow SGE {
             ch_adapter_trimming_qc = ADAPTER_TRIMMING.out.reads.map{it -> [[id: it[0].id + '_adapter_trimmed', single_end: it[0].single_end], it[1]]}
             ADAPTER_TRIMMED_SEQUENCING_QC ( ch_adapter_trimming_qc )
             ch_software_versions = ch_software_versions.mix(ADAPTER_TRIMMED_SEQUENCING_QC.out.fastqc_version, ADAPTER_TRIMMED_SEQUENCING_QC.out.seqkit_version)
-            seqkit_stat_ch = add_seqkit_stats(seqkit_stat_ch, ADAPTER_TRIMMED_SEQUENCING_QC)
+            seqkit_stat_ch = add_stats_with_stage(seqkit_stat_ch, ADAPTER_TRIMMED_SEQUENCING_QC, 'seqkit_stats')
         }
         // Send to next stage
         ch_primer_trim = ADAPTER_TRIMMING.out.reads
@@ -274,7 +274,7 @@ workflow SGE {
             ch_primer_trimming_qc = PRIMER_TRIMMING.out.reads.map{it -> [[id: it[0].id + '_primer_trimmed', single_end: it[0].single_end], it[1]]}
             PRIMER_TRIMMED_SEQUENCING_QC ( ch_primer_trimming_qc )
             ch_software_versions = ch_software_versions.mix(PRIMER_TRIMMED_SEQUENCING_QC.out.fastqc_version, PRIMER_TRIMMED_SEQUENCING_QC.out.seqkit_version)
-            seqkit_stat_ch = add_seqkit_stats(seqkit_stat_ch, PRIMER_TRIMMED_SEQUENCING_QC)
+            seqkit_stat_ch = add_stats_with_stage(seqkit_stat_ch, PRIMER_TRIMMED_SEQUENCING_QC, 'seqkit_stats')
         }
         // Send to next stage
         ch_read_merge = PRIMER_TRIMMING.out.reads
@@ -297,7 +297,7 @@ workflow SGE {
             ch_merged_read_qc = ch_read_transform
             MERGED_SEQUENCING_QC ( ch_merged_read_qc )
             ch_software_versions = ch_software_versions.mix(MERGED_SEQUENCING_QC.out.fastqc_version, MERGED_SEQUENCING_QC.out.seqkit_version)
-            seqkit_stat_ch = add_seqkit_stats(seqkit_stat_ch, MERGED_SEQUENCING_QC)
+            seqkit_stat_ch = add_stats_with_stage(seqkit_stat_ch, MERGED_SEQUENCING_QC, 'seqkit_stats')
         }
     } else {
         ch_read_transform = ch_read_merge
@@ -331,7 +331,7 @@ workflow SGE {
             ch_filtered_read_qc = READ_FILTERING.out.reads.map{it -> [[id: it[0].id + '_filtered', single_end: true], it[1]]}
             FILTERED_SEQUENCING_QC ( ch_filtered_read_qc )
             ch_software_versions = ch_software_versions.mix(FILTERED_SEQUENCING_QC.out.fastqc_version, FILTERED_SEQUENCING_QC.out.seqkit_version)
-            seqkit_stat_ch = add_seqkit_stats(seqkit_stat_ch, FILTERED_SEQUENCING_QC)
+            seqkit_stat_ch = add_stats_with_stage(seqkit_stat_ch, FILTERED_SEQUENCING_QC, 'seqkit_stats')
         }
     } else {
         ch_reads_to_modify = ch_read_filter

From 601299a81813b8e681f285c32c428bc62f216fe5 Mon Sep 17 00:00:00 2001
From: Victoria Offord <vo1@sanger.ac.uk>
Date: Sun, 17 Sep 2023 21:50:44 +0100
Subject: [PATCH 12/14] write collated output to cutadapt directory

---
 modules/local/cutadapt_json_collation/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/cutadapt_json_collation/main.nf b/modules/local/cutadapt_json_collation/main.nf
index 2418aa0..1f693f3 100644
--- a/modules/local/cutadapt_json_collation/main.nf
+++ b/modules/local/cutadapt_json_collation/main.nf
@@ -5,7 +5,7 @@ include { compose_cutadapt_jsons } from './functions'
 
 process COLLATE_CUTADAPT_JSONS {
     label 'process_low'
-    publishDir "${params.outdir}", mode: params.publish_dir_mode
+    publishDir "${params.outdir}/cutadapt", mode: params.publish_dir_mode
 
     input:
     val inputList  // list of tuples [meta, [list of jsons], [list of stages]]

From 30526b5820d98d9cacff4a3cfbe96f79ae1cb028 Mon Sep 17 00:00:00 2001
From: Iaroslav Popov <ip13@sanger.ac.uk>
Date: Mon, 18 Sep 2023 10:15:41 +0100
Subject: [PATCH 13/14] set COLLATE_CUTADAPT_JSONS default executor to 'local'

https://github.com/cancerit/QUANTS/pull/17#issuecomment-1722558091
---
 conf/base.config | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/conf/base.config b/conf/base.config
index fd5740f..37b92b1 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -54,4 +54,8 @@ process {
         errorStrategy = 'retry'
         maxRetries    = 2
     }
+
+    withName:COLLATE_CUTADAPT_JSONS {
+        executor = 'local'
+    }
 }

From 5079736bdea5f34c65737cd7c01a60d6dc7ac59b Mon Sep 17 00:00:00 2001
From: Victoria Offord <vo1@sanger.ac.uk>
Date: Wed, 11 Oct 2023 21:13:52 +0100
Subject: [PATCH 14/14] update to version 3.0.0.2

---
 CHANGELOG.md                  | 9 +++++++++
 modules/local/pyquest/main.nf | 2 +-
 nextflow.config               | 2 +-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 71c2d47..be30813 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -45,3 +45,12 @@ Initial release of QUANTS, created with the [nf-core](https://nf-co.re/) templat
 ## 3.0.0.1 - [12th September 2023]
 
 * Primer trimming - bugfix to ensure cutadapt splits reads into trimmed and untrimmed files
+
+## 3.0.0.2 - [11th October 2023]
+
+* Collation of cutadapt JSON results into single JSON file
+* Collation of SeqKit statistics results into a single TSV file
+* Update pyQUEST to version 1.1.0
+    * Improved handling of 0-length reads
+    * Ability to extract top 50 library-independent counts as FASTA
+
diff --git a/modules/local/pyquest/main.nf b/modules/local/pyquest/main.nf
index 28d6c5f..bbb78c8 100644
--- a/modules/local/pyquest/main.nf
+++ b/modules/local/pyquest/main.nf
@@ -18,7 +18,7 @@ process PYQUEST {
         container "quay.io/biocontainers/flash2:2.2.00--h5bf99c6_3"
     }
     */
-    container "quay.io/wtsicgp/pyquest:1.0.0"
+    container "quay.io/wtsicgp/pyquest:1.1.0"
 
     input:
         tuple val(meta), path(reads)
diff --git a/nextflow.config b/nextflow.config
index d913cde..8d6c77c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -190,7 +190,7 @@ manifest {
     description     = 'Analysis pipeline for saturation genome editing screens'
     mainScript      = 'main.nf'
     nextflowVersion = '!>=21.10.6'
-    version         = '3.0.0.1'
+    version         = '3.0.0.2'
 }
 
 // Function to ensure that resource requirements don't go beyond