From 5812f72e1f546aa0e89f20fc1ab6352e4d7fce8a Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Tue, 23 Jul 2024 09:45:18 +0200 Subject: [PATCH 01/22] cp cellrangermulti to cellrangermulti_vdj in SCRNASEQ workflow --- workflows/scrnaseq.nf | 70 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 10ced221..2584b928 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -5,6 +5,7 @@ include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevi include { STARSOLO } from '../subworkflows/local/starsolo' include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" include { CELLRANGER_MULTI_ALIGN } from "../subworkflows/local/align_cellrangermulti" +include { CELLRANGER_MULTI_VDJ_ALIGN } from "../subworkflows/local/align_cellrangermulti_vdj" include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align_cellrangerarc" include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc" include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion" @@ -289,6 +290,75 @@ workflow SCRNASEQ { } + // Run cellrangermulti with immuneprofiling pipeline + if (params.aligner == 'cellrangermulti_vdj') { + + // parse the input data to generate a collected channel per sample, which will have + // the metadata and data for each data-type of every sample. + // then, inside the subworkflow, it can be parsed to manage inputs to the module + ch_fastq + .map { meta, fastqs -> + def parsed_meta = meta.clone() + [ "${meta.feature_type.toString()}": fastqs ] + parsed_meta.options = [:] + + // add an universal key to differentiate from empty channels so that the "&& meta_gex?.options" lines in the module main.nf can work properly + parsed_meta.options['data-available'] = true + + // add cellranger options that are currently handled by pipeline, coming from samplesheet + // the module parses them from the 'gex' options + if (meta.feature_type.toString() == 'gex') { + parsed_meta.options['create-bam'] = true // force bam creation -- param required by cellranger multi + if (meta.expected_cells) { parsed_meta.options['expected-cells'] = meta.expected_cells } + } + + [ parsed_meta.id , parsed_meta ] + } + .groupTuple( by: 0 ) + .map{ sample_id, map_collection -> + // Now we must check if every data possibility taken into account in the .branch() operation + // performed inside the CELLRANGER_MULTI_ALIGN subworkflow are initialized, even with empty files + // This to ensure that the sizes of each data channel is the same, and the the order and the data types + // are used together with its rightful pairs + // + // data.types: gex, vdj, ab, beam, crispr, cmo + + // clone ArrayBag (received from .groupTuple()) to avoid mutating the input + def map_collection_clone = [] + map_collection_clone.addAll(map_collection) + + // generate the expected EMPTY tuple when a data type is not used + // needs to have a collected map like that, so every sample from the samplesheet is analysed one at a time, + // allowing to have multiple samples in the sheet, having all the data-type tuples initialized, + // either empty or populated. It will be branched inside the subworkflow. + if (!map_collection_clone.any{ it.feature_type == 'gex' }) { map_collection_clone.add( [id: sample_id, feature_type: 'gex' , gex: empty_file, options:[:] ] ) } + if (!map_collection_clone.any{ it.feature_type == 'vdj' }) { map_collection_clone.add( [id: sample_id, feature_type: 'vdj' , vdj: empty_file, options:[:] ] ) } + if (!map_collection_clone.any{ it.feature_type == 'ab' }) { map_collection_clone.add( [id: sample_id, feature_type: 'ab' , ab: empty_file, options:[:] ] ) } + if (!map_collection_clone.any{ it.feature_type == 'beam' }) { map_collection_clone.add( [id: sample_id, feature_type: 'beam' , beam: empty_file, options:[:] ] ) } // currently not implemented, the input samplesheet checking will not allow it. + if (!map_collection_clone.any{ it.feature_type == 'crispr' }) { map_collection_clone.add( [id: sample_id, feature_type: 'crispr', crispr: empty_file, options:[:] ] ) } + if (!map_collection_clone.any{ it.feature_type == 'cmo' }) { map_collection_clone.add( [id: sample_id, feature_type: 'cmo' , cmo: empty_file, options:[:] ] ) } + + // return final map + map_collection_clone + } + .set{ ch_cellrangermulti_collected_channel } + + // Run cellranger multi + CELLRANGER_MULTI_ALIGN( + ch_genome_fasta, + ch_filter_gtf, + ch_cellrangermulti_collected_channel, + ch_cellranger_index, + cellranger_vdj_index, + ch_multi_samplesheet + ) + ch_versions = ch_versions.mix(CELLRANGER_MULTI_ALIGN.out.ch_versions) + ch_multiqc_files = ch_multiqc_files.mix( CELLRANGER_MULTI_ALIGN.out.cellrangermulti_out.map{ + meta, outs -> outs.findAll{ it -> it.name == "web_summary.html" } + }) + ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_MULTI_ALIGN.out.cellrangermulti_mtx) + + } + // Run emptydrops calling module if ( !params.skip_emptydrops && !(params.aligner in ['cellrangerarc']) ) { From a7a5cd3f6659592c5c66bad080250cd0c044ffb9 Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Tue, 23 Jul 2024 09:46:10 +0200 Subject: [PATCH 02/22] cp align_cellrangermulti to align_cellrangermulti_vdj in local subworkflows --- .../local/align_cellrangermulti_vdj.nf | 226 ++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 subworkflows/local/align_cellrangermulti_vdj.nf diff --git a/subworkflows/local/align_cellrangermulti_vdj.nf b/subworkflows/local/align_cellrangermulti_vdj.nf new file mode 100644 index 00000000..f853c001 --- /dev/null +++ b/subworkflows/local/align_cellrangermulti_vdj.nf @@ -0,0 +1,226 @@ +// +// Include modules +// +include { CELLRANGER_MKGTF } from "../../modules/nf-core/cellranger/mkgtf/main.nf" +include { CELLRANGER_MKREF } from "../../modules/nf-core/cellranger/mkref/main.nf" +include { CELLRANGER_MKVDJREF } from "../../modules/nf-core/cellranger/mkvdjref/main.nf" +include { CELLRANGER_MULTI } from "../../modules/nf-core/cellranger/multi/main.nf" +include { PARSE_CELLRANGERMULTI_SAMPLESHEET } from "../../modules/local/parse_cellrangermulti_samplesheet.nf" + +// Define workflow to subset and index a genome region fasta file +workflow CELLRANGER_MULTI_ALIGN { + take: + ch_fasta + ch_gtf + ch_fastq + cellranger_gex_index + cellranger_vdj_index + ch_multi_samplesheet + + main: + ch_versions = Channel.empty() + + // + // TODO: Include checkers for cellranger multi parameter combinations. For example, when VDJ data is given, require VDJ ref. If FFPE, require frna probe sets, etc. + // + + // since we merged all data as a meta, now we have a channel per sample, which + // every item is a meta map for each data-type + // now we can split it back for passing as input to the module + ch_fastq + .flatten() + .map{ meta -> + def meta_clone = meta.clone() + def data_dict = meta_clone.find{ it.key == "${meta_clone.feature_type}" } + fastqs = data_dict?.value + meta_clone.remove( data_dict?.key ) + [ meta_clone, fastqs ] + } + .branch { + meta, fastq -> + gex: meta.feature_type == "gex" + return [ meta, fastq ] + vdj: meta.feature_type == "vdj" + return [ meta, fastq ] + ab: meta.feature_type == "ab" + return [ meta, fastq ] + beam: meta.feature_type == "beam" + return [ meta, fastq ] + crispr: meta.feature_type == "crispr" + return [ meta, fastq ] + cmo: meta.feature_type == "cmo" + return [ meta, fastq ] + } + .set { ch_grouped_fastq } + + // Assign other cellranger reference files + ch_gex_frna_probeset = params.gex_frna_probe_set ? file(params.gex_frna_probe_set) : [] + ch_gex_target_panel = params.gex_target_panel ? file(params.gex_target_panel) : [] + ch_gex_cmo_set = params.gex_cmo_set ? file(params.gex_cmo_set) : [] + ch_gex_barcodes = params.gex_barcode_sample_assignment ? file(params.gex_barcode_sample_assignment) : [] + ch_fb_reference = params.fb_reference ? file(params.fb_reference) : [] + ch_vdj_primer_index = params.vdj_inner_enrichment_primers ? file(params.vdj_inner_enrichment_primers) : [] + ch_beam_antigen_panel_csv = [] // currently not implemented + ch_beam_control_panel_csv = [] // currently not implemented + + // parse frna and barcode information + if (ch_multi_samplesheet) { + + // + // Here, we parse the received cellranger multi barcodes samplesheet. + // We first use the get the PARSE_CELLRANGERMULTI_SAMPLESHEET module to check it and guarantee structure + // and also split it to have one fnra/cmo .csv for each sample. + // + // The selection of the GEX fastqs is because samples are always expected to have at least GEX data. + // Then, using "combined" map, which means, the "additional barcode information" of each sample, we then, + // parse it to generate the cmo / frna samplesheets to be used by each sample. + // + // Here, to guarantee it and take advantage of the "FIFO"-rule and are sure that the data used in the + // module is from the same sample from the "normal" samplesheet. We have to use the .concat().groupTuple() + // pipe instead of .join() because .join() outputs first the arrays that could be joined and afterwards + // the ones with "remainders", thus, we would not ensure "FIFO" and the same order. + // + // To guarantee this, we can define two nf-tests, one having only one sample with CMO and another with two + // samples using CMOs, even if wrongly/repeated, but just to guarantee FIFO is working. + // + + PARSE_CELLRANGERMULTI_SAMPLESHEET( ch_multi_samplesheet ) + + ch_grouped_fastq.gex + .map{ [it[0].id] } + .concat( PARSE_CELLRANGERMULTI_SAMPLESHEET.out.cmo.flatten().map { [ "${it.baseName}" - "_cmo", it ] } ) + .groupTuple() + .map { if ( it.size() == 2 ) { it[1] } else { [] } } // a correct tuple from snippet will have: [ sample, cmo.csv ] + .set { ch_cmo_barcode_csv } + + ch_grouped_fastq.gex + .map{ [it[0].id] } + .concat( PARSE_CELLRANGERMULTI_SAMPLESHEET.out.frna.flatten().map { [ "${it.baseName}" - "_frna", it ] } ) + .groupTuple() + .map { if ( it.size() == 2 ) { it[1] } else { [] } } // a correct tuple from snippet will have: [ sample, frna.csv ] + .set { ch_frna_sample_csv } + + } else { + ch_cmo_barcode_csv = [] + ch_frna_sample_csv = [] + } + + // + // Prepare GTF + // + if ( !cellranger_gex_index || (!cellranger_vdj_index && !params.skip_cellrangermulti_vdjref) ) { + + // Filter GTF based on gene biotypes passed in params.modules + CELLRANGER_MKGTF ( ch_gtf ) + ch_versions = ch_versions.mix(CELLRANGER_MKGTF.out.versions) + + } + + // + // Prepare gex reference (Normal Ref) + // + if ( !cellranger_gex_index ) { + + // Make reference genome + CELLRANGER_MKREF( + ch_fasta, + CELLRANGER_MKGTF.out.gtf, + "gex_reference" + ) + ch_versions = ch_versions.mix(CELLRANGER_MKREF.out.versions) + ch_cellranger_gex_index = CELLRANGER_MKREF.out.reference.ifEmpty { [] } + + } else { + ch_cellranger_gex_index = cellranger_gex_index + } + + // + // Prepare vdj reference (Special) + // + if ( !cellranger_vdj_index ) { + + if ( !params.skip_cellrangermulti_vdjref ) { // if user uses cellranger multi but does not have VDJ data + // Make reference genome + CELLRANGER_MKVDJREF( + ch_fasta, + CELLRANGER_MKGTF.out.gtf, + [], // currently ignoring the 'seqs' option + "vdj_reference" + ) + ch_versions = ch_versions.mix(CELLRANGER_MKVDJREF.out.versions) + ch_cellranger_vdj_index = CELLRANGER_MKVDJREF.out.reference.ifEmpty { [] } + } else { + ch_cellranger_vdj_index = [] + } + + } else { + ch_cellranger_vdj_index = cellranger_vdj_index + } + + // + // MODULE: cellranger multi + // + CELLRANGER_MULTI( + ch_grouped_fastq.gex.map{ it[0] }, + ch_grouped_fastq.gex, + ch_grouped_fastq.vdj, + ch_grouped_fastq.ab, + ch_grouped_fastq.beam, + ch_grouped_fastq.cmo, + ch_grouped_fastq.crispr, + ch_cellranger_gex_index, + ch_gex_frna_probeset, + ch_gex_target_panel, + ch_cellranger_vdj_index, + ch_vdj_primer_index, + ch_fb_reference, + ch_beam_antigen_panel_csv, + ch_beam_control_panel_csv, + ch_gex_cmo_set, + ch_cmo_barcode_csv, + [], + ch_frna_sample_csv, + params.skip_cellranger_renaming + ) + ch_versions = ch_versions.mix(CELLRANGER_MULTI.out.versions) + + // + // Cellranger multi splits the results from each sample. So, a module execution will have: (1) a raw counts dir for all; + // (2) a filtered counts dir PER sample; (3) a raw counts dir PER sample + // + // Thus, cellranger multi outputs data from all identified samples in a single channel, which will cause file collision. + // + // For the conversion, we should convert the resulting files of each sample, thus, now, we must parse the names + // of the filtered 'per_sample_outs' of cellranger/multi on the split the channels raw / filtered. + // + + // Split channels of raw and filtered to avoid file collision problems when loading the inputs in conversion modules. + ch_matrices_filtered = parse_demultiplexed_output_channels( CELLRANGER_MULTI.out.outs, "filtered_feature_bc_matrix" ) + ch_matrices_raw = parse_demultiplexed_output_channels( CELLRANGER_MULTI.out.outs, "raw_feature_bc_matrix" ) + + emit: + ch_versions + cellrangermulti_out = CELLRANGER_MULTI.out.outs + cellrangermulti_mtx = ch_matrices_raw.mix( ch_matrices_filtered ) +} + +def parse_demultiplexed_output_channels(in_ch, pattern) { + out_ch = + in_ch.map { meta, mtx_files -> + def desired_files = [] + mtx_files.each{ if ( it.toString().contains("${pattern}") ) { desired_files.add( it ) } } + [ meta, desired_files ] + } // separate only desired files + .transpose() // transpose for handling one meta/file pair at a time + .map { meta, mtx_files -> + def meta_clone = meta.clone() + if ( mtx_files.toString().contains("per_sample_outs") ) { + def demultiplexed_sample_id = mtx_files.toString().split('/per_sample_outs/')[1].split('/')[0] + meta_clone.id = demultiplexed_sample_id.toString() + } + [ meta_clone, mtx_files ] + } // check if output is from demultiplexed sample, if yes, correct meta.id for proper conversion naming + .groupTuple( by: 0 ) // group it back as one file collection per sample + + return out_ch +} From 06e99328e4be6d0c18443cbed2c36410ab35327e Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Tue, 30 Jul 2024 11:18:10 +0200 Subject: [PATCH 03/22] separate processes that generate reference files from cellranger multi --- .../local/align_cellrangermulti_idx.nf | 227 ++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 subworkflows/local/align_cellrangermulti_idx.nf diff --git a/subworkflows/local/align_cellrangermulti_idx.nf b/subworkflows/local/align_cellrangermulti_idx.nf new file mode 100644 index 00000000..5b299d8d --- /dev/null +++ b/subworkflows/local/align_cellrangermulti_idx.nf @@ -0,0 +1,227 @@ +// +// Include modules +// +include { CELLRANGER_MKGTF } from "../../modules/nf-core/cellranger/mkgtf/main.nf" +include { CELLRANGER_MKREF } from "../../modules/nf-core/cellranger/mkref/main.nf" +include { CELLRANGER_MKVDJREF } from "../../modules/nf-core/cellranger/mkvdjref/main.nf" +include { CELLRANGER_MULTI } from "../../modules/nf-core/cellranger/multi/main.nf" +include { PARSE_CELLRANGERMULTI_SAMPLESHEET } from "../../modules/local/parse_cellrangermulti_samplesheet.nf" + +// Define workflow to subset and index a genome region fasta file +workflow CELLRANGER_MULTI_ALIGN { + take: + ch_fasta + ch_gtf + ch_fastq + cellranger_gex_index + cellranger_vdj_index + ch_multi_samplesheet + + main: + ch_versions = Channel.empty() + + // + // TODO: Include checkers for cellranger multi parameter combinations. For example, when VDJ data is given, require VDJ ref. If FFPE, require frna probe sets, etc. + // + + // since we merged all data as a meta, now we have a channel per sample, which + // every item is a meta map for each data-type + // now we can split it back for passing as input to the module + ch_fastq + .flatten() + .map{ meta -> + def meta_clone = meta.clone() + def data_dict = meta_clone.find{ it.key == "${meta_clone.feature_type}" } + fastqs = data_dict?.value + meta_clone.remove( data_dict?.key ) + [ meta_clone, fastqs ] + } + .branch { + meta, fastq -> + gex: meta.feature_type == "gex" + return [ meta, fastq ] + vdj: meta.feature_type == "vdj" + return [ meta, fastq ] + ab: meta.feature_type == "ab" + return [ meta, fastq ] + beam: meta.feature_type == "beam" + return [ meta, fastq ] + crispr: meta.feature_type == "crispr" + return [ meta, fastq ] + cmo: meta.feature_type == "cmo" + return [ meta, fastq ] + } + .set { ch_grouped_fastq } + ch_grouped_fastq.gex.view { it } + + // Assign other cellranger reference files + ch_gex_frna_probeset = params.gex_frna_probe_set ? file(params.gex_frna_probe_set) : [] + ch_gex_target_panel = params.gex_target_panel ? file(params.gex_target_panel) : [] + ch_gex_cmo_set = params.gex_cmo_set ? file(params.gex_cmo_set) : [] + ch_gex_barcodes = params.gex_barcode_sample_assignment ? file(params.gex_barcode_sample_assignment) : [] + ch_fb_reference = params.fb_reference ? file(params.fb_reference) : [] + ch_vdj_primer_index = params.vdj_inner_enrichment_primers ? file(params.vdj_inner_enrichment_primers) : [] + ch_beam_antigen_panel_csv = [] // currently not implemented + ch_beam_control_panel_csv = [] // currently not implemented + + // parse frna and barcode information + if (ch_multi_samplesheet) { + + // + // Here, we parse the received cellranger multi barcodes samplesheet. + // We first use the get the PARSE_CELLRANGERMULTI_SAMPLESHEET module to check it and guarantee structure + // and also split it to have one fnra/cmo .csv for each sample. + // + // The selection of the GEX fastqs is because samples are always expected to have at least GEX data. + // Then, using "combined" map, which means, the "additional barcode information" of each sample, we then, + // parse it to generate the cmo / frna samplesheets to be used by each sample. + // + // Here, to guarantee it and take advantage of the "FIFO"-rule and are sure that the data used in the + // module is from the same sample from the "normal" samplesheet. We have to use the .concat().groupTuple() + // pipe instead of .join() because .join() outputs first the arrays that could be joined and afterwards + // the ones with "remainders", thus, we would not ensure "FIFO" and the same order. + // + // To guarantee this, we can define two nf-tests, one having only one sample with CMO and another with two + // samples using CMOs, even if wrongly/repeated, but just to guarantee FIFO is working. + // + + PARSE_CELLRANGERMULTI_SAMPLESHEET( ch_multi_samplesheet ) + + ch_grouped_fastq.gex + .map{ [it[0].id] } + .concat( PARSE_CELLRANGERMULTI_SAMPLESHEET.out.cmo.flatten().map { [ "${it.baseName}" - "_cmo", it ] } ) + .groupTuple() + .map { if ( it.size() == 2 ) { it[1] } else { [] } } // a correct tuple from snippet will have: [ sample, cmo.csv ] + .set { ch_cmo_barcode_csv } + + ch_grouped_fastq.gex + .map{ [it[0].id] } + .concat( PARSE_CELLRANGERMULTI_SAMPLESHEET.out.frna.flatten().map { [ "${it.baseName}" - "_frna", it ] } ) + .groupTuple() + .map { if ( it.size() == 2 ) { it[1] } else { [] } } // a correct tuple from snippet will have: [ sample, frna.csv ] + .set { ch_frna_sample_csv } + + } else { + ch_cmo_barcode_csv = [] + ch_frna_sample_csv = [] + } + + // + // Prepare GTF + // + if ( !cellranger_gex_index || (!cellranger_vdj_index && !params.skip_cellrangermulti_vdjref) ) { + + // Filter GTF based on gene biotypes passed in params.modules + CELLRANGER_MKGTF ( ch_gtf ) + ch_versions = ch_versions.mix(CELLRANGER_MKGTF.out.versions) + + } + + // + // Prepare gex reference (Normal Ref) + // + if ( !cellranger_gex_index ) { + + // Make reference genome + CELLRANGER_MKREF( + ch_fasta, + CELLRANGER_MKGTF.out.gtf, + "gex_reference" + ) + ch_versions = ch_versions.mix(CELLRANGER_MKREF.out.versions) + ch_cellranger_gex_index = CELLRANGER_MKREF.out.reference.ifEmpty { [] } + + } else { + ch_cellranger_gex_index = cellranger_gex_index + } + + // + // Prepare vdj reference (Special) + // + if ( !cellranger_vdj_index ) { + + if ( !params.skip_cellrangermulti_vdjref ) { // if user uses cellranger multi but does not have VDJ data + // Make reference genome + CELLRANGER_MKVDJREF( + ch_fasta, + CELLRANGER_MKGTF.out.gtf, + [], // currently ignoring the 'seqs' option + "vdj_reference" + ) + ch_versions = ch_versions.mix(CELLRANGER_MKVDJREF.out.versions) + ch_cellranger_vdj_index = CELLRANGER_MKVDJREF.out.reference.ifEmpty { [] } + } else { + ch_cellranger_vdj_index = [] + } + + } else { + ch_cellranger_vdj_index = cellranger_vdj_index + } + + // + // MODULE: cellranger multi + // + CELLRANGER_MULTI( + ch_grouped_fastq.gex.map{ it[0] }, + ch_grouped_fastq.gex, + ch_grouped_fastq.vdj, + ch_grouped_fastq.ab, + ch_grouped_fastq.beam, + ch_grouped_fastq.cmo, + ch_grouped_fastq.crispr, + ch_cellranger_gex_index, + ch_gex_frna_probeset, + ch_gex_target_panel, + ch_cellranger_vdj_index, + ch_vdj_primer_index, + ch_fb_reference, + ch_beam_antigen_panel_csv, + ch_beam_control_panel_csv, + ch_gex_cmo_set, + ch_cmo_barcode_csv, + [], + ch_frna_sample_csv, + params.skip_cellranger_renaming + ) + ch_versions = ch_versions.mix(CELLRANGER_MULTI.out.versions) + + // + // Cellranger multi splits the results from each sample. So, a module execution will have: (1) a raw counts dir for all; + // (2) a filtered counts dir PER sample; (3) a raw counts dir PER sample + // + // Thus, cellranger multi outputs data from all identified samples in a single channel, which will cause file collision. + // + // For the conversion, we should convert the resulting files of each sample, thus, now, we must parse the names + // of the filtered 'per_sample_outs' of cellranger/multi on the split the channels raw / filtered. + // + + // Split channels of raw and filtered to avoid file collision problems when loading the inputs in conversion modules. + ch_matrices_filtered = parse_demultiplexed_output_channels( CELLRANGER_MULTI.out.outs, "filtered_feature_bc_matrix" ) + ch_matrices_raw = parse_demultiplexed_output_channels( CELLRANGER_MULTI.out.outs, "raw_feature_bc_matrix" ) + + emit: + ch_versions + cellrangermulti_out = CELLRANGER_MULTI.out.outs + cellrangermulti_mtx = ch_matrices_raw.mix( ch_matrices_filtered ) +} + +def parse_demultiplexed_output_channels(in_ch, pattern) { + out_ch = + in_ch.map { meta, mtx_files -> + def desired_files = [] + mtx_files.each{ if ( it.toString().contains("${pattern}") ) { desired_files.add( it ) } } + [ meta, desired_files ] + } // separate only desired files + .transpose() // transpose for handling one meta/file pair at a time + .map { meta, mtx_files -> + def meta_clone = meta.clone() + if ( mtx_files.toString().contains("per_sample_outs") ) { + def demultiplexed_sample_id = mtx_files.toString().split('/per_sample_outs/')[1].split('/')[0] + meta_clone.id = demultiplexed_sample_id.toString() + } + [ meta_clone, mtx_files ] + } // check if output is from demultiplexed sample, if yes, correct meta.id for proper conversion naming + .groupTuple( by: 0 ) // group it back as one file collection per sample + + return out_ch +} From 9bafa8d36e8baf1e51f55674cccec62194f4b327 Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Tue, 13 Aug 2024 10:43:26 +0200 Subject: [PATCH 04/22] installed nf-core/module bamtofastq10x --- modules/nf-core/bamtofastq10x/environment.yml | 9 +++ modules/nf-core/bamtofastq10x/main.nf | 45 +++++++++++++++ modules/nf-core/bamtofastq10x/meta.yml | 48 ++++++++++++++++ .../nf-core/bamtofastq10x/tests/main.nf.test | 55 +++++++++++++++++++ .../bamtofastq10x/tests/main.nf.test.snap | 47 ++++++++++++++++ modules/nf-core/bamtofastq10x/tests/tags.yml | 2 + 6 files changed, 206 insertions(+) create mode 100644 modules/nf-core/bamtofastq10x/environment.yml create mode 100644 modules/nf-core/bamtofastq10x/main.nf create mode 100644 modules/nf-core/bamtofastq10x/meta.yml create mode 100644 modules/nf-core/bamtofastq10x/tests/main.nf.test create mode 100644 modules/nf-core/bamtofastq10x/tests/main.nf.test.snap create mode 100644 modules/nf-core/bamtofastq10x/tests/tags.yml diff --git a/modules/nf-core/bamtofastq10x/environment.yml b/modules/nf-core/bamtofastq10x/environment.yml new file mode 100644 index 00000000..d612f512 --- /dev/null +++ b/modules/nf-core/bamtofastq10x/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "bamtofastq10x" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::10x_bamtofastq=1.4.1" diff --git a/modules/nf-core/bamtofastq10x/main.nf b/modules/nf-core/bamtofastq10x/main.nf new file mode 100644 index 00000000..be1b1441 --- /dev/null +++ b/modules/nf-core/bamtofastq10x/main.nf @@ -0,0 +1,45 @@ +process BAMTOFASTQ10X { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/10x_bamtofastq:1.4.1--hdbdd923_2': + 'biocontainers/10x_bamtofastq:1.4.1--hdbdd923_2' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.fastq.gz"), emit: fastq + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + bamtofastq \\ + $args \\ + $bam \\ + ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bamtofastq10x: \$(bamtofastq --version |& sed '1!d ; s/bamtofastq //') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bamtofastq10x: \$(bamtofastq --version |& sed '1!d ; s/bamtofastq //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bamtofastq10x/meta.yml b/modules/nf-core/bamtofastq10x/meta.yml new file mode 100644 index 00000000..2ac31a5c --- /dev/null +++ b/modules/nf-core/bamtofastq10x/meta.yml @@ -0,0 +1,48 @@ +name: bamtofastq10x + +description: Tool for converting 10x BAMs produced by Cell Ranger, Space Ranger, Cell Ranger ATAC, Cell Ranger DNA, and Long Ranger back to FASTQ files that can be used as inputs to re-run analysis + +keywords: + - bam + - convert + - fastq + - 10x + +tools: + - bamtofastq10x: + description: Tool for converting 10x BAMs produced by Cell Ranger, Space Ranger, Cell Ranger ATAC, Cell Ranger DNA, and Long Ranger back to FASTQ files that can be used as inputs to re-run analysis + homepage: https://github.com/10XGenomics/bamtofastq + documentation: https://github.com/10XGenomics/bamtofastq + tool_dev_url: https://github.com/10XGenomics/bamtofastq + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - bam: + type: file + description: BAM file + pattern: "*.bam" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + + - fastq: + type: file + description: fastq compressed file + pattern: "*.fastq.gz" + +authors: + - "@BlueBicycleBlog" +maintainers: + - "@BlueBicycleBlog" diff --git a/modules/nf-core/bamtofastq10x/tests/main.nf.test b/modules/nf-core/bamtofastq10x/tests/main.nf.test new file mode 100644 index 00000000..2b455ae8 --- /dev/null +++ b/modules/nf-core/bamtofastq10x/tests/main.nf.test @@ -0,0 +1,55 @@ +nextflow_process { + + name "Test Process BAMTOFASTQ10X" + script "../main.nf" + process "BAMTOFASTQ10X" + + tag "modules" + tag "modules_nfcore" + tag "bamtofastq10x" + + test("human - bam") { + when { + process { + """ + input[0] = Channel.of([ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/10xgenomics/10x_cr12.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + + ) + } + } + + + test("human - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/10xgenomics/10x_cr12.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert process.success } + ) + } + + } +} diff --git a/modules/nf-core/bamtofastq10x/tests/main.nf.test.snap b/modules/nf-core/bamtofastq10x/tests/main.nf.test.snap new file mode 100644 index 00000000..d5346745 --- /dev/null +++ b/modules/nf-core/bamtofastq10x/tests/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "human - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + [ + "bamtofastq_S1_L000_I1_001.fastq.gz:md5,a33682ac881de7a7453d79721b7621a0", + "bamtofastq_S1_L000_R1_001.fastq.gz:md5,5ccebf77d8636d7a7cdfc59737aea79f", + "bamtofastq_S1_L000_R2_001.fastq.gz:md5,2ee7c90e4307deba74065cfd00a65002" + ] + ] + ] + ], + "1": [ + "versions.yml:md5,845cd1d09c8a3d0059da9d074a9e5436" + ], + "fastq": [ + [ + { + "id": "test" + }, + [ + [ + "bamtofastq_S1_L000_I1_001.fastq.gz:md5,a33682ac881de7a7453d79721b7621a0", + "bamtofastq_S1_L000_R1_001.fastq.gz:md5,5ccebf77d8636d7a7cdfc59737aea79f", + "bamtofastq_S1_L000_R2_001.fastq.gz:md5,2ee7c90e4307deba74065cfd00a65002" + ] + ] + ] + ], + "versions": [ + "versions.yml:md5,845cd1d09c8a3d0059da9d074a9e5436" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.0" + }, + "timestamp": "2024-05-22T16:43:24.999397" + } +} \ No newline at end of file diff --git a/modules/nf-core/bamtofastq10x/tests/tags.yml b/modules/nf-core/bamtofastq10x/tests/tags.yml new file mode 100644 index 00000000..fe62227f --- /dev/null +++ b/modules/nf-core/bamtofastq10x/tests/tags.yml @@ -0,0 +1,2 @@ +bamtofastq10x: + - "modules/nf-core/bamtofastq10x/**" From 2e32c2efdb8a02d53d31e68278e6d984f537cf9f Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Thu, 15 Aug 2024 14:33:56 +0200 Subject: [PATCH 05/22] add null/ to the gitignore list --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index bc675aba..4bb0c4cd 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ reports/ testme.sh .nf-test* .vscode +null/ \ No newline at end of file From 6072a4c6f08931b985cd1bd1f20bf0354f6bbddf Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Thu, 15 Aug 2024 14:35:20 +0200 Subject: [PATCH 06/22] add description of demultiplexing combined with immuneprofiling --- docs/output.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/output.md b/docs/output.md index 3ab87625..8f86d55e 100644 --- a/docs/output.md +++ b/docs/output.md @@ -122,6 +122,8 @@ for the corresponding documentation. - Overall same output structure as cellranger. In case of multiplexed samples there will be one ouput folder for each demultiplexed sample, and one containing all (non-demultiplexed) cells. +- In case sample demultiplexing is to be followed by immune profiling, an extra output is added containing .fastq files + converted from the standard .bam file output. ## UniverSC From 27a8c16be0ee23cf073c4922f3481fd2dff88b7b Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Thu, 15 Aug 2024 14:37:25 +0200 Subject: [PATCH 07/22] add bamtofastq10x module with amendments --- modules.json | 5 +++++ modules/nf-core/bamtofastq10x/main.nf | 15 ++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/modules.json b/modules.json index aa186d98..e2d946c5 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "bamtofastq10x": { + "branch": "master", + "git_sha": "63d6994f4f85c0628b7f2ac1e7097136c1b4be34", + "installed_by": ["modules"] + }, "cellranger/count": { "branch": "master", "git_sha": "90dad5491658049282ceb287a3d7732c1ce39837", diff --git a/modules/nf-core/bamtofastq10x/main.nf b/modules/nf-core/bamtofastq10x/main.nf index be1b1441..c321878c 100644 --- a/modules/nf-core/bamtofastq10x/main.nf +++ b/modules/nf-core/bamtofastq10x/main.nf @@ -11,8 +11,8 @@ process BAMTOFASTQ10X { tuple val(meta), path(bam) output: - tuple val(meta), path("*.fastq.gz"), emit: fastq - path "versions.yml" , emit: versions + tuple val(meta), path("${meta.id}/**/*.fastq.gz"), emit: fastq + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -24,7 +24,16 @@ process BAMTOFASTQ10X { bamtofastq \\ $args \\ $bam \\ - ${prefix}.fastq.gz + $prefix + + out_dir=\$(find . -type d -maxdepth 2 -print | grep -m1 '${meta.sample_id}_0_1') + echo \${out_dir} + + for file in $prefix/${meta.sample_id}_0_1*/*.fastq.gz; + do + echo \$file + mv "\$file" "\${file/bamtofastq/$prefix}"; + done cat <<-END_VERSIONS > versions.yml "${task.process}": From a1d0170305265795d7cc3c5efd54cd37e581ac7a Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Thu, 15 Aug 2024 14:43:21 +0200 Subject: [PATCH 08/22] move reference creation outside the cellranger multi to avoid rerunning. additionally changing input channels --- subworkflows/local/align_cellrangermulti.nf | 61 +------- .../local/align_cellrangermulti_idx.nf | 144 +----------------- 2 files changed, 5 insertions(+), 200 deletions(-) diff --git a/subworkflows/local/align_cellrangermulti.nf b/subworkflows/local/align_cellrangermulti.nf index 977bf478..9073c0a1 100644 --- a/subworkflows/local/align_cellrangermulti.nf +++ b/subworkflows/local/align_cellrangermulti.nf @@ -1,20 +1,15 @@ // // Include modules // -include { CELLRANGER_MKGTF } from "../../modules/nf-core/cellranger/mkgtf/main.nf" -include { CELLRANGER_MKREF } from "../../modules/nf-core/cellranger/mkref/main.nf" -include { CELLRANGER_MKVDJREF } from "../../modules/nf-core/cellranger/mkvdjref/main.nf" include { CELLRANGER_MULTI } from "../../modules/nf-core/cellranger/multi/main.nf" include { PARSE_CELLRANGERMULTI_SAMPLESHEET } from "../../modules/local/parse_cellrangermulti_samplesheet.nf" // Define workflow to subset and index a genome region fasta file workflow CELLRANGER_MULTI_ALIGN { take: - ch_fasta - ch_gtf ch_fastq - cellranger_gex_index - cellranger_vdj_index + ch_cellranger_gex_index + ch_cellranger_vdj_index ch_multi_samplesheet main: @@ -109,58 +104,6 @@ workflow CELLRANGER_MULTI_ALIGN { ch_frna_sample_csv = [] } - // - // Prepare GTF - // - if ( !cellranger_gex_index || (!cellranger_vdj_index && !params.skip_cellrangermulti_vdjref) ) { - - // Filter GTF based on gene biotypes passed in params.modules - CELLRANGER_MKGTF ( ch_gtf ) - ch_versions = ch_versions.mix(CELLRANGER_MKGTF.out.versions) - - } - - // - // Prepare gex reference (Normal Ref) - // - if ( !cellranger_gex_index ) { - - // Make reference genome - CELLRANGER_MKREF( - ch_fasta, - CELLRANGER_MKGTF.out.gtf, - "gex_reference" - ) - ch_versions = ch_versions.mix(CELLRANGER_MKREF.out.versions) - ch_cellranger_gex_index = CELLRANGER_MKREF.out.reference.ifEmpty { [] } - - } else { - ch_cellranger_gex_index = cellranger_gex_index - } - - // - // Prepare vdj reference (Special) - // - if ( !cellranger_vdj_index ) { - - if ( !params.skip_cellrangermulti_vdjref ) { // if user uses cellranger multi but does not have VDJ data - // Make reference genome - CELLRANGER_MKVDJREF( - ch_fasta, - CELLRANGER_MKGTF.out.gtf, - [], // currently ignoring the 'seqs' option - "vdj_reference" - ) - ch_versions = ch_versions.mix(CELLRANGER_MKVDJREF.out.versions) - ch_cellranger_vdj_index = CELLRANGER_MKVDJREF.out.reference.ifEmpty { [] } - } else { - ch_cellranger_vdj_index = [] - } - - } else { - ch_cellranger_vdj_index = cellranger_vdj_index - } - // // MODULE: cellranger multi // diff --git a/subworkflows/local/align_cellrangermulti_idx.nf b/subworkflows/local/align_cellrangermulti_idx.nf index 5b299d8d..3396fc52 100644 --- a/subworkflows/local/align_cellrangermulti_idx.nf +++ b/subworkflows/local/align_cellrangermulti_idx.nf @@ -4,18 +4,14 @@ include { CELLRANGER_MKGTF } from "../../modules/nf-core/cellranger/mkgtf/main.nf" include { CELLRANGER_MKREF } from "../../modules/nf-core/cellranger/mkref/main.nf" include { CELLRANGER_MKVDJREF } from "../../modules/nf-core/cellranger/mkvdjref/main.nf" -include { CELLRANGER_MULTI } from "../../modules/nf-core/cellranger/multi/main.nf" -include { PARSE_CELLRANGERMULTI_SAMPLESHEET } from "../../modules/local/parse_cellrangermulti_samplesheet.nf" // Define workflow to subset and index a genome region fasta file -workflow CELLRANGER_MULTI_ALIGN { +workflow CELLRANGER_MULTI_REF { take: ch_fasta ch_gtf - ch_fastq cellranger_gex_index cellranger_vdj_index - ch_multi_samplesheet main: ch_versions = Channel.empty() @@ -24,36 +20,6 @@ workflow CELLRANGER_MULTI_ALIGN { // TODO: Include checkers for cellranger multi parameter combinations. For example, when VDJ data is given, require VDJ ref. If FFPE, require frna probe sets, etc. // - // since we merged all data as a meta, now we have a channel per sample, which - // every item is a meta map for each data-type - // now we can split it back for passing as input to the module - ch_fastq - .flatten() - .map{ meta -> - def meta_clone = meta.clone() - def data_dict = meta_clone.find{ it.key == "${meta_clone.feature_type}" } - fastqs = data_dict?.value - meta_clone.remove( data_dict?.key ) - [ meta_clone, fastqs ] - } - .branch { - meta, fastq -> - gex: meta.feature_type == "gex" - return [ meta, fastq ] - vdj: meta.feature_type == "vdj" - return [ meta, fastq ] - ab: meta.feature_type == "ab" - return [ meta, fastq ] - beam: meta.feature_type == "beam" - return [ meta, fastq ] - crispr: meta.feature_type == "crispr" - return [ meta, fastq ] - cmo: meta.feature_type == "cmo" - return [ meta, fastq ] - } - .set { ch_grouped_fastq } - ch_grouped_fastq.gex.view { it } - // Assign other cellranger reference files ch_gex_frna_probeset = params.gex_frna_probe_set ? file(params.gex_frna_probe_set) : [] ch_gex_target_panel = params.gex_target_panel ? file(params.gex_target_panel) : [] @@ -64,48 +30,6 @@ workflow CELLRANGER_MULTI_ALIGN { ch_beam_antigen_panel_csv = [] // currently not implemented ch_beam_control_panel_csv = [] // currently not implemented - // parse frna and barcode information - if (ch_multi_samplesheet) { - - // - // Here, we parse the received cellranger multi barcodes samplesheet. - // We first use the get the PARSE_CELLRANGERMULTI_SAMPLESHEET module to check it and guarantee structure - // and also split it to have one fnra/cmo .csv for each sample. - // - // The selection of the GEX fastqs is because samples are always expected to have at least GEX data. - // Then, using "combined" map, which means, the "additional barcode information" of each sample, we then, - // parse it to generate the cmo / frna samplesheets to be used by each sample. - // - // Here, to guarantee it and take advantage of the "FIFO"-rule and are sure that the data used in the - // module is from the same sample from the "normal" samplesheet. We have to use the .concat().groupTuple() - // pipe instead of .join() because .join() outputs first the arrays that could be joined and afterwards - // the ones with "remainders", thus, we would not ensure "FIFO" and the same order. - // - // To guarantee this, we can define two nf-tests, one having only one sample with CMO and another with two - // samples using CMOs, even if wrongly/repeated, but just to guarantee FIFO is working. - // - - PARSE_CELLRANGERMULTI_SAMPLESHEET( ch_multi_samplesheet ) - - ch_grouped_fastq.gex - .map{ [it[0].id] } - .concat( PARSE_CELLRANGERMULTI_SAMPLESHEET.out.cmo.flatten().map { [ "${it.baseName}" - "_cmo", it ] } ) - .groupTuple() - .map { if ( it.size() == 2 ) { it[1] } else { [] } } // a correct tuple from snippet will have: [ sample, cmo.csv ] - .set { ch_cmo_barcode_csv } - - ch_grouped_fastq.gex - .map{ [it[0].id] } - .concat( PARSE_CELLRANGERMULTI_SAMPLESHEET.out.frna.flatten().map { [ "${it.baseName}" - "_frna", it ] } ) - .groupTuple() - .map { if ( it.size() == 2 ) { it[1] } else { [] } } // a correct tuple from snippet will have: [ sample, frna.csv ] - .set { ch_frna_sample_csv } - - } else { - ch_cmo_barcode_csv = [] - ch_frna_sample_csv = [] - } - // // Prepare GTF // @@ -158,70 +82,8 @@ workflow CELLRANGER_MULTI_ALIGN { ch_cellranger_vdj_index = cellranger_vdj_index } - // - // MODULE: cellranger multi - // - CELLRANGER_MULTI( - ch_grouped_fastq.gex.map{ it[0] }, - ch_grouped_fastq.gex, - ch_grouped_fastq.vdj, - ch_grouped_fastq.ab, - ch_grouped_fastq.beam, - ch_grouped_fastq.cmo, - ch_grouped_fastq.crispr, - ch_cellranger_gex_index, - ch_gex_frna_probeset, - ch_gex_target_panel, - ch_cellranger_vdj_index, - ch_vdj_primer_index, - ch_fb_reference, - ch_beam_antigen_panel_csv, - ch_beam_control_panel_csv, - ch_gex_cmo_set, - ch_cmo_barcode_csv, - [], - ch_frna_sample_csv, - params.skip_cellranger_renaming - ) - ch_versions = ch_versions.mix(CELLRANGER_MULTI.out.versions) - - // - // Cellranger multi splits the results from each sample. So, a module execution will have: (1) a raw counts dir for all; - // (2) a filtered counts dir PER sample; (3) a raw counts dir PER sample - // - // Thus, cellranger multi outputs data from all identified samples in a single channel, which will cause file collision. - // - // For the conversion, we should convert the resulting files of each sample, thus, now, we must parse the names - // of the filtered 'per_sample_outs' of cellranger/multi on the split the channels raw / filtered. - // - - // Split channels of raw and filtered to avoid file collision problems when loading the inputs in conversion modules. - ch_matrices_filtered = parse_demultiplexed_output_channels( CELLRANGER_MULTI.out.outs, "filtered_feature_bc_matrix" ) - ch_matrices_raw = parse_demultiplexed_output_channels( CELLRANGER_MULTI.out.outs, "raw_feature_bc_matrix" ) - emit: ch_versions - cellrangermulti_out = CELLRANGER_MULTI.out.outs - cellrangermulti_mtx = ch_matrices_raw.mix( ch_matrices_filtered ) -} - -def parse_demultiplexed_output_channels(in_ch, pattern) { - out_ch = - in_ch.map { meta, mtx_files -> - def desired_files = [] - mtx_files.each{ if ( it.toString().contains("${pattern}") ) { desired_files.add( it ) } } - [ meta, desired_files ] - } // separate only desired files - .transpose() // transpose for handling one meta/file pair at a time - .map { meta, mtx_files -> - def meta_clone = meta.clone() - if ( mtx_files.toString().contains("per_sample_outs") ) { - def demultiplexed_sample_id = mtx_files.toString().split('/per_sample_outs/')[1].split('/')[0] - meta_clone.id = demultiplexed_sample_id.toString() - } - [ meta_clone, mtx_files ] - } // check if output is from demultiplexed sample, if yes, correct meta.id for proper conversion naming - .groupTuple( by: 0 ) // group it back as one file collection per sample - - return out_ch + ch_cellranger_gex_index + ch_cellranger_vdj_index } From 170ec071d7a3d0681b8506950fd894ae2eb3cb95 Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Thu, 15 Aug 2024 14:45:27 +0200 Subject: [PATCH 09/22] add subworkflow specific for handling sample demultiplexing followed by immune profiling --- .../local/align_cellrangermulti_vdj.nf | 183 +++++++++++------- 1 file changed, 114 insertions(+), 69 deletions(-) diff --git a/subworkflows/local/align_cellrangermulti_vdj.nf b/subworkflows/local/align_cellrangermulti_vdj.nf index f853c001..13919ab5 100644 --- a/subworkflows/local/align_cellrangermulti_vdj.nf +++ b/subworkflows/local/align_cellrangermulti_vdj.nf @@ -1,21 +1,19 @@ // // Include modules // -include { CELLRANGER_MKGTF } from "../../modules/nf-core/cellranger/mkgtf/main.nf" -include { CELLRANGER_MKREF } from "../../modules/nf-core/cellranger/mkref/main.nf" -include { CELLRANGER_MKVDJREF } from "../../modules/nf-core/cellranger/mkvdjref/main.nf" -include { CELLRANGER_MULTI } from "../../modules/nf-core/cellranger/multi/main.nf" -include { PARSE_CELLRANGERMULTI_SAMPLESHEET } from "../../modules/local/parse_cellrangermulti_samplesheet.nf" +include { CELLRANGER_MULTI as CELLRANGER_MULTI_DEMUX } from "../../modules/nf-core/cellranger/multi/main.nf" +include { CELLRANGER_MULTI as CELLRANGER_MULTI_IMMUNE } from "../../modules/nf-core/cellranger/multi/main.nf" +include { PARSE_CELLRANGERMULTI_SAMPLESHEET } from "../../modules/local/parse_cellrangermulti_samplesheet.nf" +include { BAMTOFASTQ10X } from '../../modules/nf-core/bamtofastq10x/main' // Define workflow to subset and index a genome region fasta file -workflow CELLRANGER_MULTI_ALIGN { +workflow CELLRANGER_MULTI_ALIGN_VDJ { take: - ch_fasta - ch_gtf ch_fastq - cellranger_gex_index - cellranger_vdj_index + ch_cellranger_gex_index + ch_cellranger_vdj_index ch_multi_samplesheet + empty_file main: ch_versions = Channel.empty() @@ -53,6 +51,28 @@ workflow CELLRANGER_MULTI_ALIGN { } .set { ch_grouped_fastq } + // Add faux VDJ channel to first run cellranger without immune profiling + ch_grouped_fastq.vdj.map { meta, fastqs -> + def meta_clone = meta.clone() + meta_clone.options = "[:]" + [meta_clone, empty_file] + } + .set { ch_faux_vdj_fastq } + // Add faux CMO channel to first run cellranger without sample demultiplexing + ch_grouped_fastq.cmo.map { meta, fastqs -> + def meta_clone = meta.clone() + meta_clone.options = "[:]" + [meta_clone, empty_file] + } + .set { ch_faux_cmo_fastq } + // Add faux Ab channel + ch_grouped_fastq.ab.map { meta, fastqs -> + def meta_clone = meta.clone() + meta_clone.options = "[:]" + [meta_clone, empty_file] + } + .set { ch_faux_ab_fastq } + // Assign other cellranger reference files ch_gex_frna_probeset = params.gex_frna_probe_set ? file(params.gex_frna_probe_set) : [] ch_gex_target_panel = params.gex_target_panel ? file(params.gex_target_panel) : [] @@ -88,14 +108,14 @@ workflow CELLRANGER_MULTI_ALIGN { ch_grouped_fastq.gex .map{ [it[0].id] } - .concat( PARSE_CELLRANGERMULTI_SAMPLESHEET.out.cmo.flatten().map { [ "${it.baseName}" - "_cmo", it ] } ) + .concat( PARSE_CELLRANGERMULTI_SAMPLESHEET.out.cmo.flatten().map { [get_sample_id(it, "_cmo"), it ] } ) .groupTuple() .map { if ( it.size() == 2 ) { it[1] } else { [] } } // a correct tuple from snippet will have: [ sample, cmo.csv ] .set { ch_cmo_barcode_csv } ch_grouped_fastq.gex .map{ [it[0].id] } - .concat( PARSE_CELLRANGERMULTI_SAMPLESHEET.out.frna.flatten().map { [ "${it.baseName}" - "_frna", it ] } ) + .concat( PARSE_CELLRANGERMULTI_SAMPLESHEET.out.frna.flatten().map { [get_sample_id(it, "_frna"), it ] } ) .groupTuple() .map { if ( it.size() == 2 ) { it[1] } else { [] } } // a correct tuple from snippet will have: [ sample, frna.csv ] .set { ch_frna_sample_csv } @@ -106,67 +126,53 @@ workflow CELLRANGER_MULTI_ALIGN { } // - // Prepare GTF - // - if ( !cellranger_gex_index || (!cellranger_vdj_index && !params.skip_cellrangermulti_vdjref) ) { - - // Filter GTF based on gene biotypes passed in params.modules - CELLRANGER_MKGTF ( ch_gtf ) - ch_versions = ch_versions.mix(CELLRANGER_MKGTF.out.versions) - - } - - // - // Prepare gex reference (Normal Ref) + // MODULE: cellranger multi // - if ( !cellranger_gex_index ) { - - // Make reference genome - CELLRANGER_MKREF( - ch_fasta, - CELLRANGER_MKGTF.out.gtf, - "gex_reference" - ) - ch_versions = ch_versions.mix(CELLRANGER_MKREF.out.versions) - ch_cellranger_gex_index = CELLRANGER_MKREF.out.reference.ifEmpty { [] } - - } else { - ch_cellranger_gex_index = cellranger_gex_index - } + CELLRANGER_MULTI_DEMUX( + ch_grouped_fastq.gex.map{ it[0] }, + ch_grouped_fastq.gex, + ch_faux_vdj_fastq, + ch_faux_ab_fastq, + ch_grouped_fastq.beam, + ch_grouped_fastq.cmo, + ch_grouped_fastq.crispr, + ch_cellranger_gex_index, + ch_gex_frna_probeset, + ch_gex_target_panel, + ch_cellranger_vdj_index, + ch_vdj_primer_index, + ch_fb_reference, + ch_beam_antigen_panel_csv, + ch_beam_control_panel_csv, + ch_gex_cmo_set, + ch_cmo_barcode_csv, + [], + ch_frna_sample_csv, + params.skip_cellranger_renaming + ) + ch_versions = ch_versions.mix(CELLRANGER_MULTI_DEMUX.out.versions) + ch_bam_files = extract_bam(CELLRANGER_MULTI_DEMUX.out.outs) // - // Prepare vdj reference (Special) + // MODULE: bam to fastq // - if ( !cellranger_vdj_index ) { - - if ( !params.skip_cellrangermulti_vdjref ) { // if user uses cellranger multi but does not have VDJ data - // Make reference genome - CELLRANGER_MKVDJREF( - ch_fasta, - CELLRANGER_MKGTF.out.gtf, - [], // currently ignoring the 'seqs' option - "vdj_reference" - ) - ch_versions = ch_versions.mix(CELLRANGER_MKVDJREF.out.versions) - ch_cellranger_vdj_index = CELLRANGER_MKVDJREF.out.reference.ifEmpty { [] } - } else { - ch_cellranger_vdj_index = [] - } - - } else { - ch_cellranger_vdj_index = cellranger_vdj_index - } - + BAMTOFASTQ10X( + ch_bam_files + ) + ch_versions = ch_versions.mix(BAMTOFASTQ10X.out.versions) + BAMTOFASTQ10X.out.fastq.view { "bamtofq10x: $it" } + ch_grouped_fastq.gex.view { "gex: $it" } + // // MODULE: cellranger multi // - CELLRANGER_MULTI( - ch_grouped_fastq.gex.map{ it[0] }, - ch_grouped_fastq.gex, + CELLRANGER_MULTI_IMMUNE( + BAMTOFASTQ10X.out.fastq.map{ it[0] }, + BAMTOFASTQ10X.out.fastq, ch_grouped_fastq.vdj, ch_grouped_fastq.ab, ch_grouped_fastq.beam, - ch_grouped_fastq.cmo, + ch_faux_cmo_fastq, ch_grouped_fastq.crispr, ch_cellranger_gex_index, ch_gex_frna_probeset, @@ -176,13 +182,13 @@ workflow CELLRANGER_MULTI_ALIGN { ch_fb_reference, ch_beam_antigen_panel_csv, ch_beam_control_panel_csv, - ch_gex_cmo_set, - ch_cmo_barcode_csv, + [], + [], [], ch_frna_sample_csv, params.skip_cellranger_renaming ) - ch_versions = ch_versions.mix(CELLRANGER_MULTI.out.versions) + ch_versions = ch_versions.mix(CELLRANGER_MULTI_IMMUNE.out.versions) // // Cellranger multi splits the results from each sample. So, a module execution will have: (1) a raw counts dir for all; @@ -195,15 +201,22 @@ workflow CELLRANGER_MULTI_ALIGN { // // Split channels of raw and filtered to avoid file collision problems when loading the inputs in conversion modules. - ch_matrices_filtered = parse_demultiplexed_output_channels( CELLRANGER_MULTI.out.outs, "filtered_feature_bc_matrix" ) - ch_matrices_raw = parse_demultiplexed_output_channels( CELLRANGER_MULTI.out.outs, "raw_feature_bc_matrix" ) + ch_matrices_filtered = parse_demultiplexed_output_channels( CELLRANGER_MULTI_IMMUNE.out.outs, "filtered_feature_bc_matrix" ) + ch_matrices_raw = parse_demultiplexed_output_channels( CELLRANGER_MULTI_IMMUNE.out.outs, "raw_feature_bc_matrix" ) emit: ch_versions - cellrangermulti_out = CELLRANGER_MULTI.out.outs + cellrangermulti_out = CELLRANGER_MULTI_IMMUNE.out.outs cellrangermulti_mtx = ch_matrices_raw.mix( ch_matrices_filtered ) } +def get_sample_id(in_ch, pattern="_cmo") { + def bname = in_ch.baseName + def idx = bname.lastIndexOf(pattern) + def modified_bname = (idx != -1) ? bname[0.. @@ -224,3 +237,35 @@ def parse_demultiplexed_output_channels(in_ch, pattern) { return out_ch } + + +def extract_bam(in_ch) { + out_ch = + in_ch.map { meta, bam_files -> + def desired_files = [] + bam_files.each{ if ( it.toString().endsWith("sample_alignments.bam") ) { desired_files.add( it ) } } + [ meta, desired_files ] + } + .transpose() // transpose for handling one meta/file pair at a time + .map { meta, bam_files -> + def meta_clone = meta.clone() + if ( bam_files.toString().contains("per_sample_outs") ) { + def demux_id = bam_files.toString().split('/per_sample_outs/')[1].split('/')[0] + meta_clone.sample_id = meta_clone.id + meta_clone.id = demux_id.toString() + } + [ meta_clone, bam_files ] + } // check if output is from demultiplexed sample, if yes, correct meta.id for proper conversion naming + .groupTuple( by: 0 ) // group it back as one file collection per sample + + return out_ch +} + +def extract_gex_fq(in_ch) { + out_ch = + in_ch.map { meta, fns -> + def desired_files = [] + fns.each{ if ( it.toString().contains("/demultiplex*_0_1_*/*.fastq.gz") ) { desired_files.add( it ) } } + + } +} \ No newline at end of file From 3b71ff8bfcd6df9bcfd7c6a23357db392779c57a Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Thu, 15 Aug 2024 14:49:35 +0200 Subject: [PATCH 10/22] implement cellranger multi ref and vdj. branch channels to either run cellranger multi or cellranger multi+vdj --- workflows/scrnaseq.nf | 100 +++++++++++++----------------------------- 1 file changed, 31 insertions(+), 69 deletions(-) diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 2584b928..ac0318c5 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -4,8 +4,9 @@ include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kalli include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin' include { STARSOLO } from '../subworkflows/local/starsolo' include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" +include { CELLRANGER_MULTI_REF } from "../subworkflows/local/align_cellrangermulti_idx" include { CELLRANGER_MULTI_ALIGN } from "../subworkflows/local/align_cellrangermulti" -include { CELLRANGER_MULTI_VDJ_ALIGN } from "../subworkflows/local/align_cellrangermulti_vdj" +include { CELLRANGER_MULTI_ALIGN_VDJ } from "../subworkflows/local/align_cellrangermulti_vdj" include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align_cellrangerarc" include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc" include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion" @@ -224,74 +225,13 @@ workflow SCRNASEQ { // Run cellrangermulti pipeline if (params.aligner == 'cellrangermulti') { - // parse the input data to generate a collected channel per sample, which will have - // the metadata and data for each data-type of every sample. - // then, inside the subworkflow, it can be parsed to manage inputs to the module - ch_fastq - .map { meta, fastqs -> - def parsed_meta = meta.clone() + [ "${meta.feature_type.toString()}": fastqs ] - parsed_meta.options = [:] - - // add an universal key to differentiate from empty channels so that the "&& meta_gex?.options" lines in the module main.nf can work properly - parsed_meta.options['data-available'] = true - - // add cellranger options that are currently handled by pipeline, coming from samplesheet - // the module parses them from the 'gex' options - if (meta.feature_type.toString() == 'gex') { - parsed_meta.options['create-bam'] = true // force bam creation -- param required by cellranger multi - if (meta.expected_cells) { parsed_meta.options['expected-cells'] = meta.expected_cells } - } - - [ parsed_meta.id , parsed_meta ] - } - .groupTuple( by: 0 ) - .map{ sample_id, map_collection -> - // Now we must check if every data possibility taken into account in the .branch() operation - // performed inside the CELLRANGER_MULTI_ALIGN subworkflow are initialized, even with empty files - // This to ensure that the sizes of each data channel is the same, and the the order and the data types - // are used together with its rightful pairs - // - // data.types: gex, vdj, ab, beam, crispr, cmo - - // clone ArrayBag (received from .groupTuple()) to avoid mutating the input - def map_collection_clone = [] - map_collection_clone.addAll(map_collection) - - // generate the expected EMPTY tuple when a data type is not used - // needs to have a collected map like that, so every sample from the samplesheet is analysed one at a time, - // allowing to have multiple samples in the sheet, having all the data-type tuples initialized, - // either empty or populated. It will be branched inside the subworkflow. - if (!map_collection_clone.any{ it.feature_type == 'gex' }) { map_collection_clone.add( [id: sample_id, feature_type: 'gex' , gex: empty_file, options:[:] ] ) } - if (!map_collection_clone.any{ it.feature_type == 'vdj' }) { map_collection_clone.add( [id: sample_id, feature_type: 'vdj' , vdj: empty_file, options:[:] ] ) } - if (!map_collection_clone.any{ it.feature_type == 'ab' }) { map_collection_clone.add( [id: sample_id, feature_type: 'ab' , ab: empty_file, options:[:] ] ) } - if (!map_collection_clone.any{ it.feature_type == 'beam' }) { map_collection_clone.add( [id: sample_id, feature_type: 'beam' , beam: empty_file, options:[:] ] ) } // currently not implemented, the input samplesheet checking will not allow it. - if (!map_collection_clone.any{ it.feature_type == 'crispr' }) { map_collection_clone.add( [id: sample_id, feature_type: 'crispr', crispr: empty_file, options:[:] ] ) } - if (!map_collection_clone.any{ it.feature_type == 'cmo' }) { map_collection_clone.add( [id: sample_id, feature_type: 'cmo' , cmo: empty_file, options:[:] ] ) } - - // return final map - map_collection_clone - } - .set{ ch_cellrangermulti_collected_channel } - - // Run cellranger multi - CELLRANGER_MULTI_ALIGN( + CELLRANGER_MULTI_REF( ch_genome_fasta, ch_filter_gtf, - ch_cellrangermulti_collected_channel, ch_cellranger_index, cellranger_vdj_index, - ch_multi_samplesheet ) - ch_versions = ch_versions.mix(CELLRANGER_MULTI_ALIGN.out.ch_versions) - ch_multiqc_files = ch_multiqc_files.mix( CELLRANGER_MULTI_ALIGN.out.cellrangermulti_out.map{ - meta, outs -> outs.findAll{ it -> it.name == "web_summary.html" } - }) - ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_MULTI_ALIGN.out.cellrangermulti_mtx) - - } - - // Run cellrangermulti with immuneprofiling pipeline - if (params.aligner == 'cellrangermulti_vdj') { + ch_versions = ch_versions.mix(CELLRANGER_MULTI_REF.out.ch_versions) // parse the input data to generate a collected channel per sample, which will have // the metadata and data for each data-type of every sample. @@ -342,13 +282,22 @@ workflow SCRNASEQ { } .set{ ch_cellrangermulti_collected_channel } + // Split channel to either run standard cellranger multi or to run sample demultiplexing followed by immune profiling. + ch_cellrangermulti_collected_channel.branch { sample -> + def vdj_idx = sample.feature_type.findIndexOf{ it == 'vdj'} + def cmo_idx = sample.feature_type.findIndexOf{ it == 'cmo'} + demux_vdj: + // if files are listed for a feature_type, the value is null + // otherwise the value is a path to the empty file: assets/EMPTY + sample[vdj_idx].vdj == null && sample[cmo_idx].cmo == null + demux: true + }.set { ch_cellrangermulti_collected_channel_branched } + // Run cellranger multi CELLRANGER_MULTI_ALIGN( - ch_genome_fasta, - ch_filter_gtf, - ch_cellrangermulti_collected_channel, - ch_cellranger_index, - cellranger_vdj_index, + ch_cellrangermulti_collected_channel_branched.demux, + CELLRANGER_MULTI_REF.out.ch_cellranger_gex_index, + CELLRANGER_MULTI_REF.out.ch_cellranger_vdj_index, ch_multi_samplesheet ) ch_versions = ch_versions.mix(CELLRANGER_MULTI_ALIGN.out.ch_versions) @@ -357,6 +306,19 @@ workflow SCRNASEQ { }) ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_MULTI_ALIGN.out.cellrangermulti_mtx) + // Run cellranger multi vdj + CELLRANGER_MULTI_ALIGN_VDJ( + ch_cellrangermulti_collected_channel_branched.demux_vdj, + CELLRANGER_MULTI_REF.out.ch_cellranger_gex_index, + CELLRANGER_MULTI_REF.out.ch_cellranger_vdj_index, + ch_multi_samplesheet, + empty_file + ) + ch_versions = ch_versions.mix(CELLRANGER_MULTI_ALIGN_VDJ.out.ch_versions) + ch_multiqc_files = ch_multiqc_files.mix( CELLRANGER_MULTI_ALIGN_VDJ.out.cellrangermulti_out.map{ + meta, outs -> outs.findAll{ it -> it.name == "web_summary.html" } + }) + ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_MULTI_ALIGN_VDJ.out.cellrangermulti_mtx) } // Run emptydrops calling module From 0132d9bbe5559d6875c447a1000e37402a7d97c2 Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Thu, 15 Aug 2024 14:52:29 +0200 Subject: [PATCH 11/22] update publishDir for the two cellranger multi outputs. add publishDir to bamtofastq process --- conf/modules.config | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 81395a1d..ebc0ba9f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -218,7 +218,7 @@ if (params.aligner == 'kallisto') { if (params.aligner == 'cellrangermulti') { process { withName: FASTQC { ext.prefix = { "${meta.id}_${meta.feature_type}" } } // allow distinguishment of data types after renaming - withName: 'NFCORE_SCRNASEQ:SCRNASEQ:CELLRANGER_MULTI_ALIGN:CELLRANGER_MULTI' { + withName: 'NFCORE_SCRNASEQ:SCRNASEQ:CELLRANGER_MULTI_ALIGN:CELLRANGER_MULTI|NFCORE_SCRNASEQ:SCRNASEQ:CELLRANGER_MULTI_ALIGN_VDJ:CELLRANGER_MULTI_DEMUX|NFCORE_SCRNASEQ:SCRNASEQ:CELLRANGER_MULTI_ALIGN_VDJ:CELLRANGER_MULTI_IMMUNE' { ext.prefix = null // force it null, for some reason it was being wrongly read in the module publishDir = [ path: "${params.outdir}/${params.aligner}/count", @@ -250,5 +250,12 @@ if (params.aligner == 'cellrangermulti') { mode: params.publish_dir_mode ] } + withName: BAMTOFASTQ10X { + publishDir = [ + path: "${params.outdir}/${params.aligner}/bam2fastq", + mode: params.publish_dir_mode + ] + ext.args = "--reads-per-fastq=2200000000" + } } } From 1434e45c8ead9e6bb9eac8187ddb208d0feeb2ff Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Wed, 21 Aug 2024 08:52:12 +0200 Subject: [PATCH 12/22] add func to expand feature channels to match demultiplexed gex. modified extract_gex_fq --- .../local/align_cellrangermulti_vdj.nf | 48 +++++++++++++++---- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/subworkflows/local/align_cellrangermulti_vdj.nf b/subworkflows/local/align_cellrangermulti_vdj.nf index 13919ab5..2c56780c 100644 --- a/subworkflows/local/align_cellrangermulti_vdj.nf +++ b/subworkflows/local/align_cellrangermulti_vdj.nf @@ -160,20 +160,24 @@ workflow CELLRANGER_MULTI_ALIGN_VDJ { ch_bam_files ) ch_versions = ch_versions.mix(BAMTOFASTQ10X.out.versions) - BAMTOFASTQ10X.out.fastq.view { "bamtofq10x: $it" } - ch_grouped_fastq.gex.view { "gex: $it" } + ch_bamtofastq = extract_gex_fq(BAMTOFASTQ10X.out.fastq) + + ch_expanded_vdj = expand_feature_by_demultiplexed_samples(ch_grouped_fastq.vdj, ch_bamtofastq) + ch_expanded_ab = expand_feature_by_demultiplexed_samples(ch_grouped_fastq.ab, ch_bamtofastq) + ch_expanded_beam = expand_feature_by_demultiplexed_samples(ch_grouped_fastq.beam, ch_bamtofastq) + ch_expanded_crispr = expand_feature_by_demultiplexed_samples(ch_grouped_fastq.crispr, ch_bamtofastq) // // MODULE: cellranger multi // CELLRANGER_MULTI_IMMUNE( - BAMTOFASTQ10X.out.fastq.map{ it[0] }, - BAMTOFASTQ10X.out.fastq, - ch_grouped_fastq.vdj, - ch_grouped_fastq.ab, - ch_grouped_fastq.beam, + ch_bamtofastq.map{ it[0] }, + ch_bamtofastq, + ch_expanded_vdj, + ch_expanded_ab, + ch_expanded_beam, ch_faux_cmo_fastq, - ch_grouped_fastq.crispr, + ch_expanded_crispr, ch_cellranger_gex_index, ch_gex_frna_probeset, ch_gex_target_panel, @@ -264,8 +268,32 @@ def extract_bam(in_ch) { def extract_gex_fq(in_ch) { out_ch = in_ch.map { meta, fns -> + def meta_clone = meta.clone() + meta_clone.options['check-library-compatibility'] = false // def desired_files = [] - fns.each{ if ( it.toString().contains("/demultiplex*_0_1_*/*.fastq.gz") ) { desired_files.add( it ) } } - + fns.each{ if ( it.toString().contains("/${meta.sample_id}_0_1_") ) { desired_files.add( it ) } } + [ meta_clone, desired_files ] } + + return out_ch +} + +def expand_feature_by_demultiplexed_samples(in_ch, gex_ch) { + out_ch = + in_ch + .map{ meta, fns -> + def meta_clone = meta.clone() + meta_clone.sample_id = meta_clone.id + [meta_clone, fns] + } + .cross(gex_ch) { it[0][-1] } // test also it[0]["sample_id"] + .map{ftx, gex -> + def ftx_meta_clone = ftx[0].clone() + def gex_meta_clone = gex[0].clone() + assert ftx_meta_clone.sample_id == gex_meta_clone.sample_id + ftx_meta_clone.id = gex_meta_clone.id + [ftx_meta_clone, ftx[1]] + } + + return out_ch } \ No newline at end of file From 18b9eae6a317ceea5baaeeb64ff9a3afd7e6ed0b Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Wed, 21 Aug 2024 08:53:12 +0200 Subject: [PATCH 13/22] remove renaming of files --- modules/nf-core/bamtofastq10x/main.nf | 8 -------- 1 file changed, 8 deletions(-) diff --git a/modules/nf-core/bamtofastq10x/main.nf b/modules/nf-core/bamtofastq10x/main.nf index c321878c..03393d28 100644 --- a/modules/nf-core/bamtofastq10x/main.nf +++ b/modules/nf-core/bamtofastq10x/main.nf @@ -26,14 +26,6 @@ process BAMTOFASTQ10X { $bam \\ $prefix - out_dir=\$(find . -type d -maxdepth 2 -print | grep -m1 '${meta.sample_id}_0_1') - echo \${out_dir} - - for file in $prefix/${meta.sample_id}_0_1*/*.fastq.gz; - do - echo \$file - mv "\$file" "\${file/bamtofastq/$prefix}"; - done cat <<-END_VERSIONS > versions.yml "${task.process}": From ecf453fe4974a1b67f744cd9cbe124e327723cc2 Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Tue, 3 Sep 2024 16:59:03 +0200 Subject: [PATCH 14/22] removed arg for BAMTOFASTQ and updated CELLRANGER_MULTI with regex --- conf/modules.config | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ebc0ba9f..92f28057 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -218,7 +218,7 @@ if (params.aligner == 'kallisto') { if (params.aligner == 'cellrangermulti') { process { withName: FASTQC { ext.prefix = { "${meta.id}_${meta.feature_type}" } } // allow distinguishment of data types after renaming - withName: 'NFCORE_SCRNASEQ:SCRNASEQ:CELLRANGER_MULTI_ALIGN:CELLRANGER_MULTI|NFCORE_SCRNASEQ:SCRNASEQ:CELLRANGER_MULTI_ALIGN_VDJ:CELLRANGER_MULTI_DEMUX|NFCORE_SCRNASEQ:SCRNASEQ:CELLRANGER_MULTI_ALIGN_VDJ:CELLRANGER_MULTI_IMMUNE' { + withName: 'NFCORE_SCRNASEQ:SCRNASEQ:CELLRANGER_MULTI_ALIGN(_VDJ)?:CELLRANGER_MULTI.*' { ext.prefix = null // force it null, for some reason it was being wrongly read in the module publishDir = [ path: "${params.outdir}/${params.aligner}/count", @@ -255,7 +255,6 @@ if (params.aligner == 'cellrangermulti') { path: "${params.outdir}/${params.aligner}/bam2fastq", mode: params.publish_dir_mode ] - ext.args = "--reads-per-fastq=2200000000" } } } From 8359254ee19f7492f10179b114c772120eae8dbe Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Tue, 3 Sep 2024 17:01:15 +0200 Subject: [PATCH 15/22] changed faux channels to value channels to be consumed infinitely and updated extract_gex_fq() --- .../local/align_cellrangermulti_vdj.nf | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/align_cellrangermulti_vdj.nf b/subworkflows/local/align_cellrangermulti_vdj.nf index 2c56780c..7cfda04b 100644 --- a/subworkflows/local/align_cellrangermulti_vdj.nf +++ b/subworkflows/local/align_cellrangermulti_vdj.nf @@ -57,6 +57,7 @@ workflow CELLRANGER_MULTI_ALIGN_VDJ { meta_clone.options = "[:]" [meta_clone, empty_file] } + .first() // convert to value channel to be consumed indefinitely .set { ch_faux_vdj_fastq } // Add faux CMO channel to first run cellranger without sample demultiplexing ch_grouped_fastq.cmo.map { meta, fastqs -> @@ -64,6 +65,7 @@ workflow CELLRANGER_MULTI_ALIGN_VDJ { meta_clone.options = "[:]" [meta_clone, empty_file] } + .first() // convert to value channel to be consumed indefinitely .set { ch_faux_cmo_fastq } // Add faux Ab channel ch_grouped_fastq.ab.map { meta, fastqs -> @@ -71,6 +73,7 @@ workflow CELLRANGER_MULTI_ALIGN_VDJ { meta_clone.options = "[:]" [meta_clone, empty_file] } + .first() // convert to value channel to be consumed indefinitely .set { ch_faux_ab_fastq } // Assign other cellranger reference files @@ -189,7 +192,7 @@ workflow CELLRANGER_MULTI_ALIGN_VDJ { [], [], [], - ch_frna_sample_csv, + ch_frna_sample_csv, // currently not implemented nor tested params.skip_cellranger_renaming ) ch_versions = ch_versions.mix(CELLRANGER_MULTI_IMMUNE.out.versions) @@ -265,14 +268,31 @@ def extract_bam(in_ch) { return out_ch } +def extractParts(filename) { + // convert lane, read, and sequence number to integers to sort files. + def matcher = filename =~ /L(\d{3})_R(\d)_(\d{3})/ + if (matcher.find()) { + return [matcher.group(1).toInteger(), matcher.group(2).toInteger(), matcher.group(3).toInteger()] + } + return [0, 0, 0] // Default value if pattern not found +} + def extract_gex_fq(in_ch) { + // Extract GEX fastq files from bamtofastq output and sort files in read pairs. out_ch = in_ch.map { meta, fns -> def meta_clone = meta.clone() - meta_clone.options['check-library-compatibility'] = false // + meta_clone.options['check-library-compatibility'] = false // in order for downstream immune profiling not to fail. def desired_files = [] + // GEX fq files are located in the "*_0_1_*" directory as the multi config always starts with GEX files. fns.each{ if ( it.toString().contains("/${meta.sample_id}_0_1_") ) { desired_files.add( it ) } } - [ meta_clone, desired_files ] + // Sort files to pair R1 and R2 from the same lane (L001, L002, etc) and sequence number (001, 002, etc) + def sortedFiles = desired_files.sort { a, b -> + def partsA = extractParts(a) + def partsB = extractParts(b) + return partsA[0] <=> partsB[0] ?: partsA[2] <=> partsB[2] ?: partsA[1] <=> partsB[1] + } + [ meta_clone, sortedFiles ] } return out_ch From c4356b0f59b04960bddc3ea799b79cd79e939677 Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Tue, 3 Sep 2024 17:07:35 +0200 Subject: [PATCH 16/22] remove unused code --- subworkflows/local/align_cellrangermulti_idx.nf | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/subworkflows/local/align_cellrangermulti_idx.nf b/subworkflows/local/align_cellrangermulti_idx.nf index 3396fc52..186cbddc 100644 --- a/subworkflows/local/align_cellrangermulti_idx.nf +++ b/subworkflows/local/align_cellrangermulti_idx.nf @@ -20,16 +20,6 @@ workflow CELLRANGER_MULTI_REF { // TODO: Include checkers for cellranger multi parameter combinations. For example, when VDJ data is given, require VDJ ref. If FFPE, require frna probe sets, etc. // - // Assign other cellranger reference files - ch_gex_frna_probeset = params.gex_frna_probe_set ? file(params.gex_frna_probe_set) : [] - ch_gex_target_panel = params.gex_target_panel ? file(params.gex_target_panel) : [] - ch_gex_cmo_set = params.gex_cmo_set ? file(params.gex_cmo_set) : [] - ch_gex_barcodes = params.gex_barcode_sample_assignment ? file(params.gex_barcode_sample_assignment) : [] - ch_fb_reference = params.fb_reference ? file(params.fb_reference) : [] - ch_vdj_primer_index = params.vdj_inner_enrichment_primers ? file(params.vdj_inner_enrichment_primers) : [] - ch_beam_antigen_panel_csv = [] // currently not implemented - ch_beam_control_panel_csv = [] // currently not implemented - // // Prepare GTF // From ced5ddc4f91089b0cf3cd8fb3f73f61abf2ada64 Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Tue, 3 Sep 2024 17:08:36 +0200 Subject: [PATCH 17/22] renamed file --- .../{align_cellrangermulti_idx.nf => cellrangermulti_ref.nf} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename subworkflows/local/{align_cellrangermulti_idx.nf => cellrangermulti_ref.nf} (100%) diff --git a/subworkflows/local/align_cellrangermulti_idx.nf b/subworkflows/local/cellrangermulti_ref.nf similarity index 100% rename from subworkflows/local/align_cellrangermulti_idx.nf rename to subworkflows/local/cellrangermulti_ref.nf From 8c6e1c060fd2c72e57d92091a265a2d59ec68c86 Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Wed, 4 Sep 2024 10:13:11 +0200 Subject: [PATCH 18/22] update output path for fastq --- modules/nf-core/bamtofastq10x/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/nf-core/bamtofastq10x/main.nf b/modules/nf-core/bamtofastq10x/main.nf index 03393d28..904d6c14 100644 --- a/modules/nf-core/bamtofastq10x/main.nf +++ b/modules/nf-core/bamtofastq10x/main.nf @@ -11,8 +11,8 @@ process BAMTOFASTQ10X { tuple val(meta), path(bam) output: - tuple val(meta), path("${meta.id}/**/*.fastq.gz"), emit: fastq - path "versions.yml" , emit: versions + tuple val(meta), path("**/*.fastq.gz"), emit: fastq + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when From e22f8fb8019d0e468e27d69f0153461a17f75d73 Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Wed, 4 Sep 2024 10:13:55 +0200 Subject: [PATCH 19/22] update output dir for emptydrops analysis --- conf/modules.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/modules.config b/conf/modules.config index 92f28057..12f972ae 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -38,6 +38,7 @@ process { mode: params.publish_dir_mode, saveAs: { filename -> if ( params.aligner == 'cellranger' ) "count/${meta.id}/${filename}" + else if ( params.aligner == 'cellrangermulti' ) "emptydrops/${meta.id}/${filename}" else if ( params.aligner == 'kallisto' ) "${meta.id}.count/${filename}" else "${meta.id}/${filename}" } From eff615882bbea18645faefe964c8ba8aee9aee70 Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Wed, 4 Sep 2024 10:14:42 +0200 Subject: [PATCH 20/22] update filename for generating reference files for cellranger multi --- workflows/scrnaseq.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index ac0318c5..b6131526 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -4,7 +4,7 @@ include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kalli include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin' include { STARSOLO } from '../subworkflows/local/starsolo' include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" -include { CELLRANGER_MULTI_REF } from "../subworkflows/local/align_cellrangermulti_idx" +include { CELLRANGER_MULTI_REF } from "../subworkflows/local/cellrangermulti_ref" include { CELLRANGER_MULTI_ALIGN } from "../subworkflows/local/align_cellrangermulti" include { CELLRANGER_MULTI_ALIGN_VDJ } from "../subworkflows/local/align_cellrangermulti_vdj" include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align_cellrangerarc" From 998f9674d2610f8d9ae6e71852f6c75f3858a75a Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Mon, 9 Sep 2024 16:36:42 +0200 Subject: [PATCH 21/22] remove frna option for immune-profiling --- subworkflows/local/align_cellrangermulti_vdj.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/align_cellrangermulti_vdj.nf b/subworkflows/local/align_cellrangermulti_vdj.nf index 7cfda04b..22bee092 100644 --- a/subworkflows/local/align_cellrangermulti_vdj.nf +++ b/subworkflows/local/align_cellrangermulti_vdj.nf @@ -192,7 +192,7 @@ workflow CELLRANGER_MULTI_ALIGN_VDJ { [], [], [], - ch_frna_sample_csv, // currently not implemented nor tested + [], // TODO params.skip_cellranger_renaming ) ch_versions = ch_versions.mix(CELLRANGER_MULTI_IMMUNE.out.versions) From cef87597ef59e5d4d45e42e49ebe99470ccafe15 Mon Sep 17 00:00:00 2001 From: Helle Rus Povlsen Date: Fri, 4 Oct 2024 15:09:57 +0200 Subject: [PATCH 22/22] updated bamtofastq10x module --- modules.json | 2 +- modules/nf-core/bamtofastq10x/environment.yml | 2 - modules/nf-core/bamtofastq10x/main.nf | 4 +- modules/nf-core/bamtofastq10x/meta.yml | 56 ++++++++++--------- .../bamtofastq10x/tests/main.nf.test.snap | 22 +++----- 5 files changed, 42 insertions(+), 44 deletions(-) diff --git a/modules.json b/modules.json index e2d946c5..63bd6c80 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "bamtofastq10x": { "branch": "master", - "git_sha": "63d6994f4f85c0628b7f2ac1e7097136c1b4be34", + "git_sha": "2d82007b83328343d0e2a9cec087b628eef7e3d1", "installed_by": ["modules"] }, "cellranger/count": { diff --git a/modules/nf-core/bamtofastq10x/environment.yml b/modules/nf-core/bamtofastq10x/environment.yml index d612f512..cce34f4b 100644 --- a/modules/nf-core/bamtofastq10x/environment.yml +++ b/modules/nf-core/bamtofastq10x/environment.yml @@ -1,9 +1,7 @@ --- # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json -name: "bamtofastq10x" channels: - conda-forge - bioconda - - defaults dependencies: - "bioconda::10x_bamtofastq=1.4.1" diff --git a/modules/nf-core/bamtofastq10x/main.nf b/modules/nf-core/bamtofastq10x/main.nf index 904d6c14..bff379c1 100644 --- a/modules/nf-core/bamtofastq10x/main.nf +++ b/modules/nf-core/bamtofastq10x/main.nf @@ -26,7 +26,6 @@ process BAMTOFASTQ10X { $bam \\ $prefix - cat <<-END_VERSIONS > versions.yml "${task.process}": bamtofastq10x: \$(bamtofastq --version |& sed '1!d ; s/bamtofastq //') @@ -36,7 +35,8 @@ process BAMTOFASTQ10X { stub: def prefix = task.ext.prefix ?: "${meta.id}" """ - touch ${prefix}.fastq.gz + mkdir -p ${prefix}/bamtofastq10x + touch ${prefix}/bamtofastq10x/bamtofastq.fastq.gz cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/bamtofastq10x/meta.yml b/modules/nf-core/bamtofastq10x/meta.yml index 2ac31a5c..f3e5dfca 100644 --- a/modules/nf-core/bamtofastq10x/meta.yml +++ b/modules/nf-core/bamtofastq10x/meta.yml @@ -1,6 +1,8 @@ name: bamtofastq10x -description: Tool for converting 10x BAMs produced by Cell Ranger, Space Ranger, Cell Ranger ATAC, Cell Ranger DNA, and Long Ranger back to FASTQ files that can be used as inputs to re-run analysis +description: Tool for converting 10x BAMs produced by Cell Ranger, Space Ranger, Cell + Ranger ATAC, Cell Ranger DNA, and Long Ranger back to FASTQ files that can be used + as inputs to re-run analysis keywords: - bam @@ -10,38 +12,40 @@ keywords: tools: - bamtofastq10x: - description: Tool for converting 10x BAMs produced by Cell Ranger, Space Ranger, Cell Ranger ATAC, Cell Ranger DNA, and Long Ranger back to FASTQ files that can be used as inputs to re-run analysis + description: Tool for converting 10x BAMs produced by Cell Ranger, Space Ranger, + Cell Ranger ATAC, Cell Ranger DNA, and Long Ranger back to FASTQ files that + can be used as inputs to re-run analysis homepage: https://github.com/10XGenomics/bamtofastq documentation: https://github.com/10XGenomics/bamtofastq tool_dev_url: https://github.com/10XGenomics/bamtofastq licence: ["MIT"] + identifier: "" input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. `[ id:'sample1', single_end:false ]` - - bam: - type: file - description: BAM file - pattern: "*.bam" + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - bam: + type: file + description: BAM file + pattern: "*.bam" output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. `[ id:'sample1', single_end:false ]` - - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - fastq: - type: file - description: fastq compressed file - pattern: "*.fastq.gz" - + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "**/*.fastq.gz": + type: file + description: fastq compressed file + pattern: "**/*.fastq.gz" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@BlueBicycleBlog" maintainers: diff --git a/modules/nf-core/bamtofastq10x/tests/main.nf.test.snap b/modules/nf-core/bamtofastq10x/tests/main.nf.test.snap index d5346745..fd531091 100644 --- a/modules/nf-core/bamtofastq10x/tests/main.nf.test.snap +++ b/modules/nf-core/bamtofastq10x/tests/main.nf.test.snap @@ -8,11 +8,9 @@ "id": "test" }, [ - [ - "bamtofastq_S1_L000_I1_001.fastq.gz:md5,a33682ac881de7a7453d79721b7621a0", - "bamtofastq_S1_L000_R1_001.fastq.gz:md5,5ccebf77d8636d7a7cdfc59737aea79f", - "bamtofastq_S1_L000_R2_001.fastq.gz:md5,2ee7c90e4307deba74065cfd00a65002" - ] + "bamtofastq_S1_L000_I1_001.fastq.gz:md5,a33682ac881de7a7453d79721b7621a0", + "bamtofastq_S1_L000_R1_001.fastq.gz:md5,5ccebf77d8636d7a7cdfc59737aea79f", + "bamtofastq_S1_L000_R2_001.fastq.gz:md5,2ee7c90e4307deba74065cfd00a65002" ] ] ], @@ -25,11 +23,9 @@ "id": "test" }, [ - [ - "bamtofastq_S1_L000_I1_001.fastq.gz:md5,a33682ac881de7a7453d79721b7621a0", - "bamtofastq_S1_L000_R1_001.fastq.gz:md5,5ccebf77d8636d7a7cdfc59737aea79f", - "bamtofastq_S1_L000_R2_001.fastq.gz:md5,2ee7c90e4307deba74065cfd00a65002" - ] + "bamtofastq_S1_L000_I1_001.fastq.gz:md5,a33682ac881de7a7453d79721b7621a0", + "bamtofastq_S1_L000_R1_001.fastq.gz:md5,5ccebf77d8636d7a7cdfc59737aea79f", + "bamtofastq_S1_L000_R2_001.fastq.gz:md5,2ee7c90e4307deba74065cfd00a65002" ] ] ], @@ -39,9 +35,9 @@ } ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.0" + "nf-test": "0.9.0", + "nextflow": "24.04.4" }, - "timestamp": "2024-05-22T16:43:24.999397" + "timestamp": "2024-10-02T12:56:21.808042" } } \ No newline at end of file