mskcc · anoronh4 · Nov 20, 2023 · Nov 20, 2023 · Nov 20, 2023 · Nov 21, 2023
diff --git a/assets/analysis_multiqc_config.yml b/assets/analysis_multiqc_config.yml
@@ -21,9 +21,19 @@ custom_data:
           title: "Gene Count"
           format: "{:,.0f}"
           description: "Kallisto: Number of genes with detected expression"
+  # Per-sample strandedness table. The rows come from the files matched by
+  # the `assess_strandedness` search pattern declared under `sp`.
+  assess_strandedness:
+    file_format: "tsv"
+    section_name: "Strandedness"
+    description: "Pass/Fail status of sample strandedness based on the input file"
+    plot_type: "table"
+    pconfig:
+      id: "strandedness"
+      namespace: "strandedness_table"
 
 sp:
   htseq_expression_genstats:
     fn: "*.htseq.summary.txt"
   kallisto_expression_genstats:
     fn: "*.kallisto.customsummary.txt"
+  # Search pattern feeding the `assess_strandedness` custom_data section.
+  assess_strandedness:
+    fn: "*.strandedness.tsv"
diff --git a/conf/base.config b/conf/base.config
@@ -67,3 +67,5 @@ process {
         cache = false
     }
 }
+
+// Turns on Nextflow's module-binaries feature flag.
+// NOTE(review): no module touched by this change visibly bundles its own
+// executables - confirm the flag is actually required.
+nextflow.enable.moduleBinaries = true
diff --git a/conf/modules.config b/conf/modules.config
@@ -485,4 +485,16 @@ process {
         ]
     }
 
+    // Nothing run inside the INFER_STRAND subworkflow is published.
+    withName: '.*:INFER_STRAND:.*' {
+        publishDir = [
+            enabled: false
+        ]
+    }
+
+    // EXTRACTSTRAND output is not published by the process itself.
+    withName: EXTRACTSTRAND {
+        publishDir = [
+            enabled: false
+        ]
+    }
+
 }
diff --git a/modules.json b/modules.json
@@ -131,6 +131,11 @@
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
                         "installed_by": ["modules"]
                     },
+                    "seqtk/sample": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"]
+                    },
                     "star/align": {
                         "branch": "master",
                         "git_sha": "57d75dbac06812c59798a48585032f6e50bb1914",

diff --git a/modules/local/extractstrand/main.nf b/modules/local/extractstrand/main.nf
@@ -0,0 +1,57 @@
+/*
+ * EXTRACTSTRAND
+ *
+ * Turns a Picard CollectRnaSeqMetrics metrics file into a one-row TSV that
+ * records the strandedness given in the samplesheet, the strandedness inferred
+ * from the R1/R2 transcript-strand read counts, and whether the two agree.
+ *
+ * input:
+ *   meta    - sample map; `id`, `strandedness` and `auto_strandedness` are read
+ *   metrics - metrics file (header expected on line 7, values on line 8)
+ *
+ * output:
+ *   strand   - [ meta, <prefix>.strandedness.tsv ]
+ *   versions - versions.yml
+ */
+process EXTRACTSTRAND {
+    tag "$meta.id"
+    label 'process_single'
+
+    // "channel::package=version" is the conda package-spec syntax.
+    conda "conda-forge::pandas=1.5.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+            'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
+            'biocontainers/pandas:1.5.2' }"
+
+    input:
+    tuple val(meta), path(metrics)
+
+    output:
+    tuple val(meta), path("*.strandedness.tsv"), emit: strand
+    path "versions.yml"                       , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    #!/usr/bin/env python
+
+    import pandas as pd
+
+    R1 = 'NUM_R1_TRANSCRIPT_STRAND_READS'
+    R2 = 'NUM_R2_TRANSCRIPT_STRAND_READS'
+
+    # Only rows 6 and 7 (0-based) are read: the column header and its values.
+    df = pd.read_csv("${metrics}", skiprows=(lambda x: x not in [6, 7]), sep="\\t")
+    df = df[[R1, R2]]
+
+    def infer_strandedness(row):
+        # A library is called stranded when one orientation outnumbers the
+        # other by at least 3 to 1.
+        r1, r2 = row[R1], row[R2]
+        if r1 + r2 == 0:
+            # No strand-informative reads: nothing supports a stranded call.
+            return "no"
+        if r1 / 3 >= r2:
+            return "yes"
+        if r2 / 3 >= r1:
+            return "reverse"
+        return "no"
+
+    df['input_strandedness']    = "${meta.auto_strandedness ? "auto" : meta.strandedness}"
+    df['inferred_strandedness'] = df.apply(infer_strandedness, axis=1)
+    df['input_strand_correct']  = df['inferred_strandedness'] == "${meta.strandedness}"
+    df.index                    = ['${meta.id}']
+
+    desired_column_order = ['input_strandedness', 'inferred_strandedness', 'input_strand_correct', R1, R2]
+    df = df[desired_column_order]
+
+    df.to_csv("${prefix}.strandedness.tsv", sep="\\t", index=True)
+
+    # Report the pandas version that actually ran.
+    with open("versions.yml", 'w') as f:
+        f.write('"${task.process}":\\n')
+        f.write(f"    pandas: {pd.__version__}\\n")
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    #!/usr/bin/env python
+
+    # The file name has to satisfy the '*.strandedness.tsv' output glob.
+    with open("${prefix}.strandedness.tsv", 'w') as f:
+        pass
+
+    with open("versions.yml", 'w') as f:
+        f.write('"${task.process}":\\n')
+        f.write("    pandas: 1.5.2\\n")
+    """
+}
diff --git a/modules/nf-core/seqtk/sample/environment.yml b/modules/nf-core/seqtk/sample/environment.yml
diff --git a/modules/nf-core/seqtk/sample/main.nf b/modules/nf-core/seqtk/sample/main.nf
diff --git a/modules/nf-core/seqtk/sample/meta.yml b/modules/nf-core/seqtk/sample/meta.yml
diff --git a/subworkflows/local/infer_strand.nf b/subworkflows/local/infer_strand.nf
@@ -0,0 +1,78 @@
+include { SEQTK_SAMPLE                } from '../../modules/nf-core/seqtk/sample/main'
+include { PREPROCESS_READS            } from './preprocess_reads'
+include { PICARD_COLLECTRNASEQMETRICS } from '../../modules/nf-core/picard/collectrnaseqmetrics/main'
+include { EXTRACTSTRAND               } from '../../modules/local/extractstrand/main'
+include { STAR_ALIGN                  } from '../../modules/nf-core/star/align/main'
+include { GROUP_READS                 } from './group_reads'
+
+/*
+ * INFER_STRAND
+ *
+ * Resolves meta.strandedness for entries whose meta.auto_strandedness is set.
+ * Those entries are grouped, subsampled with seqtk, preprocessed, aligned with
+ * STAR and summarised by Picard CollectRnaSeqMetrics; EXTRACTSTRAND converts
+ * the metrics into a strandedness call that is written back into the meta map.
+ * Entries without the flag pass through unchanged.
+ *
+ * take:
+ *   reads      - channel of [ meta, reads ]; meta.sample and
+ *                meta.auto_strandedness are read here
+ *   star_index - forwarded to STAR_ALIGN
+ *   gtf        - forwarded to STAR_ALIGN
+ *   refflat    - forwarded to PICARD_COLLECTRNASEQMETRICS
+ *   fasta      - forwarded to PICARD_COLLECTRNASEQMETRICS
+ *
+ * emit:
+ *   reads       - [ meta, reads ] with meta.strandedness filled in for auto entries
+ *   ch_versions - versions.yml files from the processes run here
+ */
+workflow INFER_STRAND {
+
+    take:
+    reads
+    star_index
+    gtf
+    refflat
+    fasta
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    // Only entries flagged for automatic detection go through inference.
+    reads_branch = reads
+        .branch{ meta, fastqs ->
+            auto: meta.auto_strandedness
+            other: true
+        }
+
+    GROUP_READS(reads_branch.auto)
+
+    // Subsample the first FASTQ (or first pair) of each group; 50000 is passed
+    // as the third tuple element (presumably the number of reads to keep).
+    SEQTK_SAMPLE(
+        GROUP_READS.out.grouped_reads
+            .map{ meta, fastqs ->
+                [ meta, meta.single_end ? [fastqs[0]] : [fastqs[0], fastqs[1]], 50000 ]
+            }
+    )
+    ch_versions = ch_versions.mix(SEQTK_SAMPLE.out.versions.first())
+
+    PREPROCESS_READS(SEQTK_SAMPLE.out.reads)
+    ch_versions = ch_versions.mix(PREPROCESS_READS.out.ch_versions.first())
+
+    STAR_ALIGN(
+        PREPROCESS_READS.out.reads_untrimmed,
+        star_index,
+        gtf,
+        false,
+        [],
+        []
+    )
+    ch_versions = ch_versions.mix(STAR_ALIGN.out.versions.first())
+
+    PICARD_COLLECTRNASEQMETRICS(
+        STAR_ALIGN.out.bam,
+        refflat,
+        fasta,
+        []
+    )
+    ch_versions = ch_versions.mix(PICARD_COLLECTRNASEQMETRICS.out.versions.first())
+
+    EXTRACTSTRAND(PICARD_COLLECTRNASEQMETRICS.out.metrics)
+    ch_versions = ch_versions.mix(EXTRACTSTRAND.out.versions.first())
+
+    // Attach the per-sample call to every original entry of that sample.
+    // combine(by:) is used instead of join because join pairs items one-to-one,
+    // so further entries sharing the same sample key would stay unmatched and
+    // be dropped. Only the auto branch is combined; the remaining entries are
+    // added back untouched by the final mix.
+    amended_reads = EXTRACTSTRAND.out.strand
+        .map{ meta, strand_txt ->
+            [ meta["sample"], strand_txt ]
+        }.combine(
+            reads_branch.auto.map{ meta, fastqs ->
+                [ meta["sample"], meta, fastqs ]
+            }, by:[0]
+        ).map{ sample, strand_txt, meta, fastqs ->
+            def new_meta = meta.clone()
+            // Line 1 is the data row; field 2 is inferred_strandedness
+            // (field 0 is the row index, field 1 is input_strandedness).
+            new_meta["strandedness"] = strand_txt.readLines()[1].split("\\t")[2]
+
+            [new_meta, fastqs]
+        }.mix( reads_branch.other )
+
+    emit:
+    reads       = amended_reads
+    ch_versions = ch_versions
+
+}
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
@@ -52,8 +52,9 @@ def create_fastq_channel(LinkedHashMap row) {
     try {
         meta.umi2 = meta.umi2.toInteger() * "N"
     } catch(Exception e) { }
-    meta.strandedness = row.strand ? (row.strand.trim() == "" ? "no" : row.strand.trim()) : "no"
-    if (! ["yes","no","reverse"].contains(meta.strandedness)){
+    meta.strandedness = row.strand ? (row.strand.trim() == "" ? "auto" : row.strand.trim()) : "auto"
+    meta.auto_strandedness = meta.strandedness == "auto" ? true : false
+    if (! ["yes","no","reverse","auto"].contains(meta.strandedness)){
         exit 1, "ERROR: Please check input samplesheet -> strand value is invalid!\n${row.strand ? row.strand : ""}"
     }
 

diff --git a/subworkflows/local/preprocess_reads.nf b/subworkflows/local/preprocess_reads.nf
@@ -21,7 +21,7 @@ workflow PREPROCESS_READS {
         .map{ meta, reads ->
             def meta_clone = meta.clone()
             if (params.extract_fq_read_group) {
-                def rg_map = Utils.flowcellLaneFromFastq(reads[0])
+                def rg_map = Utils.flowcellLaneFromFastq([reads].flatten()[0])
                 meta_clone.read_group = "${meta.sample}@${rg_map["fcid"]}@${rg_map["lane"]}@${meta.fastq_pair_id}"
                 meta_clone.id = meta_clone.read_group
             } else {

diff --git a/subworkflows/local/qc.nf b/subworkflows/local/qc.nf
@@ -7,6 +7,8 @@ include {
     MULTIQC as MULTIQC_COLLECT
 } from '../../modules/nf-core/multiqc/main'
 include { BAM_RSEQC                   } from '../nf-core/bam_rseqc/main'
+include { EXTRACTSTRAND               } from '../../modules/local/extractstrand/main'
+
 
 workflow QC {
 
@@ -40,6 +42,11 @@ workflow QC {
     )
     ch_versions = ch_versions.mix(PICARD_COLLECTRNASEQMETRICS.out.versions.first())
 
+    EXTRACTSTRAND(
+        PICARD_COLLECTRNASEQMETRICS.out.metrics
+    )
+    ch_versions = ch_versions.mix(EXTRACTSTRAND.out.versions.first())
+
     PICARD_COLLECTHSMETRICS(
         bam
             .filter{ meta, bam ->
@@ -58,6 +65,7 @@ workflow QC {
 
     multiqc_files = multiqc_files
         .mix(PICARD_COLLECTRNASEQMETRICS.out.metrics)
+        .mix(EXTRACTSTRAND.out.strand)
         .mix(PICARD_COLLECTHSMETRICS.out.metrics)
         .mix(BAM_RSEQC.out.bamstat_txt)
         .mix(BAM_RSEQC.out.innerdistance_freq)

diff --git a/workflows/forte.nf b/workflows/forte.nf
@@ -50,6 +50,7 @@ include { BAIT_INPUTS     } from '../subworkflows/local/baits'
 //
 include { CUSTOM_DUMPSOFTWAREVERSIONS       } from '../modules/nf-core/custom/dumpsoftwareversions/main'
 include { PREPARE_REFERENCES                } from '../subworkflows/local/prepare_references'
+include { INFER_STRAND                      } from '../subworkflows/local/infer_strand'
 include { PREPROCESS_READS                  } from '../subworkflows/local/preprocess_reads'
 include { ALIGN_READS                       } from '../subworkflows/local/align_reads'
 include { MULTIQC                           } from '../modules/nf-core/multiqc/main'
@@ -91,9 +92,16 @@ workflow FORTE {
     PREPARE_REFERENCES()
     ch_versions = ch_versions.mix(PREPARE_REFERENCES.out.ch_versions)
 
+    // Resolve strandedness for samples submitted with strand "auto" before
+    // the reads enter the main preprocessing branch.
+    INFER_STRAND(
+        INPUT_CHECK.out.reads,
+        PREPARE_REFERENCES.out.star_index,
+        PREPARE_REFERENCES.out.gtf,
+        PREPARE_REFERENCES.out.refflat,
+        params.fasta
+    )
+    // Collect the subworkflow's tool versions like the neighbouring calls do.
+    ch_versions = ch_versions.mix(INFER_STRAND.out.ch_versions)
 
     PREPROCESS_READS(
-        INPUT_CHECK.out.reads
+        INFER_STRAND.out.reads
     )
     ch_versions = ch_versions.mix(PREPROCESS_READS.out.ch_versions)