From 52fa9ca8bb94e6e8b564d03ec41f2a1ae08291ab Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Mon, 19 Aug 2024 11:22:53 +0200 Subject: [PATCH 01/66] comment out for development --- workflows/scrnaseq.nf | 62 +++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 10ced221..4c1ef914 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -290,37 +290,37 @@ workflow SCRNASEQ { } // Run emptydrops calling module - if ( !params.skip_emptydrops && !(params.aligner in ['cellrangerarc']) ) { - - // - // emptydrops should only run on the raw matrices thus, filter-out the filtered result of the aligners that can produce it - // - if ( params.aligner in [ 'cellranger', 'cellrangermulti', 'kallisto', 'star' ] ) { - ch_mtx_matrices_for_emptydrops = - ch_mtx_matrices.filter { meta, mtx_files -> - mtx_files.toString().contains("raw_feature_bc_matrix") || // cellranger - mtx_files.toString().contains("counts_unfiltered") || // kallisto - mtx_files.toString().contains("raw") // star - } - } else { - ch_mtx_matrices_for_emptydrops = ch_mtx_matrices - } - - EMPTYDROPS_CELL_CALLING( ch_mtx_matrices_for_emptydrops ) - ch_mtx_matrices = ch_mtx_matrices.mix( EMPTYDROPS_CELL_CALLING.out.filtered_matrices ) - - } - - // Run mtx to h5ad conversion subworkflow - MTX_CONVERSION ( - ch_mtx_matrices, - ch_input, - ch_txp2gene, - ch_star_index - ) - - //Add Versions from MTX Conversion workflow too - ch_versions.mix(MTX_CONVERSION.out.ch_versions) + // if ( !params.skip_emptydrops && !(params.aligner in ['cellrangerarc']) ) { + + // // + // // emptydrops should only run on the raw matrices thus, filter-out the filtered result of the aligners that can produce it + // // + // if ( params.aligner in [ 'cellranger', 'cellrangermulti', 'kallisto', 'star' ] ) { + // ch_mtx_matrices_for_emptydrops = + // ch_mtx_matrices.filter { meta, mtx_files -> + // mtx_files.toString().contains("raw_feature_bc_matrix") || // cellranger + // mtx_files.toString().contains("counts_unfiltered") || // kallisto + // mtx_files.toString().contains("raw") // star + // } + // } else { + // ch_mtx_matrices_for_emptydrops = ch_mtx_matrices + // } + + // EMPTYDROPS_CELL_CALLING( ch_mtx_matrices_for_emptydrops ) + // ch_mtx_matrices = ch_mtx_matrices.mix( EMPTYDROPS_CELL_CALLING.out.filtered_matrices ) + + // } + + // // Run mtx to h5ad conversion subworkflow + // MTX_CONVERSION ( + // ch_mtx_matrices, + // ch_input, + // ch_txp2gene, + // ch_star_index + // ) + + // //Add Versions from MTX Conversion workflow too + // ch_versions.mix(MTX_CONVERSION.out.ch_versions) // // Collate and save software versions From bbf299c5ea80a4cd03e3e9add5d6d761ece9a1a2 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Mon, 19 Aug 2024 12:19:37 +0200 Subject: [PATCH 02/66] refact modules for STAR aligner --- bin/mtx_to_h5ad_star.py | 89 +++++++++++++++++++++++++++++++ modules/local/mtx_to_h5ad_star.nf | 51 ++++++++++++++++++ subworkflows/local/starsolo.nf | 14 +++-- workflows/scrnaseq.nf | 3 -- 4 files changed, 150 insertions(+), 7 deletions(-) create mode 100755 bin/mtx_to_h5ad_star.py create mode 100644 modules/local/mtx_to_h5ad_star.nf diff --git a/bin/mtx_to_h5ad_star.py b/bin/mtx_to_h5ad_star.py new file mode 100755 index 00000000..a8c2ec22 --- /dev/null +++ b/bin/mtx_to_h5ad_star.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python + +# Set numba chache dir to current working directory (which is a writable mount also in containers) +import os + +os.environ["NUMBA_CACHE_DIR"] = "." + +import scanpy as sc +import pandas as pd +import argparse +from scipy import io +from anndata import AnnData + +def _mtx_to_adata( + mtx_file: str, + barcode_file: str, + feature_file: str, + sample: str, +): + adata = sc.read_mtx(mtx_file) + adata = adata.transpose() + adata.obs_names = pd.read_csv(barcode_file, header=None, sep="\t")[0].values + adata.var_names = pd.read_csv(feature_file, header=None, sep="\t")[0].values + adata.obs["sample"] = sample + + return adata + + +def input_to_adata( + input_data: str, + barcode_file: str, + feature_file: str, + sample: str, + star_index: str, +): + print("Reading in {}".format(input_data)) + + # open main data + adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample) + + # open gene information + txp2gene = "{}/geneInfo.tab".format(star_index) + print("Reading in {}".format(txp2gene)) + t2g = pd.read_table("{}".format(txp2gene), header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1]) + t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id") + + # standard format + adata.var.index = t2g.index.values.tolist() + adata.var["gene_symbol"] = t2g["gene_symbol"] + + return adata + +def dump_versions(task_process): + import pkg_resources + + with open("versions.yml", "w") as f: + f.write(f"{task_process}:\n\t") + f.write("\n\t".join([f"{pkg.key}: {pkg.version}" for pkg in pkg_resources.working_set])) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Converts mtx output to h5ad.") + + parser.add_argument("-i", "--input_data", dest="input_data", help="Path to either mtx or mtx h5 file.") + parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.", nargs="?", const="") + parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.", nargs="?", const="") + parser.add_argument("-s", "--sample", dest="sample", help="Sample name") + parser.add_argument("-o", "--out", dest="out", help="Output path.") + parser.add_argument("--task_process", dest="task_process", help="Task process name.") + parser.add_argument("--star_index", dest="star_index", help="Star index folder containing geneInfo.tab.", nargs="?", const="") + + args = vars(parser.parse_args()) + + # create the directory with the sample name + os.makedirs(os.path.dirname(args["out"]), exist_ok=True) + + adata = input_to_adata( + input_data=args["input_data"], + barcode_file=args["barcode"], + feature_file=args["feature"], + sample=args["sample"], + star_index=args["star_index"] + ) + + adata.write_h5ad(args["out"], compression="gzip") + + print("Wrote h5ad file to {}".format(args["out"])) + + dump_versions(task_process=args["task_process"]) diff --git a/modules/local/mtx_to_h5ad_star.nf b/modules/local/mtx_to_h5ad_star.nf new file mode 100644 index 00000000..ae42262f --- /dev/null +++ b/modules/local/mtx_to_h5ad_star.nf @@ -0,0 +1,51 @@ +process MTX_TO_H5AD_STAR { + tag "$meta.id" + label 'process_medium' + + conda "conda-forge::scanpy conda-forge::python-igraph conda-forge::leidenalg" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/scanpy:1.7.2--pyhdfd78af_0' : + 'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }" + + input: + tuple val(meta), path(inputs) + path star_index + + output: + tuple val(meta2), path("${meta.id}/*h5ad"), emit: h5ad + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Get a file to check input type. Some aligners bring arrays instead of a single file. + def input_to_check = (inputs instanceof String) ? inputs : inputs[0] + + // check input type of inputs + input_type = (input_to_check.toUriString().contains('unfiltered') || input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered' + meta2 = meta + [input_type: input_type] + mtx_matrix = "${input_type}/matrix.mtx.gz" + barcodes_tsv = "${input_type}/barcodes.tsv.gz" + features_tsv = "${input_type}/features.tsv.gz" + + + """ + # convert file types + mtx_to_h5ad_star.py \\ + --task_process ${task.process} \\ + --sample ${meta.id} \\ + --input $mtx_matrix \\ + --barcode $barcodes_tsv \\ + --feature $features_tsv \\ + --star_index ${star_index} \\ + --out ${meta.id}/${meta.id}_${input_type}_matrix.h5ad + """ + + stub: + """ + mkdir ${meta.id} + touch ${meta.id}/${meta.id}_matrix.h5ad + touch versions.yml + """ +} diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf index 0c11acd1..19dc9221 100644 --- a/subworkflows/local/starsolo.nf +++ b/subworkflows/local/starsolo.nf @@ -1,5 +1,6 @@ /* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ -include { STAR_ALIGN } from '../../modules/local/star_align' +include { STAR_ALIGN } from '../../modules/local/star_align' +include { MTX_TO_H5AD_STAR as MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad_star' /* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */ include { GUNZIP } from '../../modules/nf-core/gunzip/main' @@ -53,14 +54,19 @@ workflow STARSOLO { ) ch_versions = ch_versions.mix(STAR_ALIGN.out.versions) + /* + * Perform h5ad conversion + */ + MTX_TO_H5AD ( + STAR_ALIGN.out.raw_counts.mix( STAR_ALIGN.out.filtered_counts ), + star_index.map{ meta, index -> index } + ) + emit: ch_versions // get rid of meta for star index - star_index = star_index.map{ meta, index -> index } star_result = STAR_ALIGN.out.tab star_counts = STAR_ALIGN.out.counts - raw_counts = STAR_ALIGN.out.raw_counts - filtered_counts = STAR_ALIGN.out.filtered_counts for_multiqc = STAR_ALIGN.out.log_final.map{ meta, it -> it } } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 4c1ef914..e58978da 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -171,8 +171,6 @@ workflow SCRNASEQ { protocol_config.get('extra_args', ""), ) ch_versions = ch_versions.mix(STARSOLO.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(STARSOLO.out.raw_counts, STARSOLO.out.filtered_counts) - ch_star_index = STARSOLO.out.star_index ch_multiqc_files = ch_multiqc_files.mix(STARSOLO.out.for_multiqc) } @@ -186,7 +184,6 @@ workflow SCRNASEQ { protocol_config['protocol'] ) ch_versions = ch_versions.mix(CELLRANGER_ALIGN.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_ALIGN.out.cellranger_matrices) ch_star_index = CELLRANGER_ALIGN.out.star_index ch_multiqc_files = ch_multiqc_files.mix(CELLRANGER_ALIGN.out.cellranger_out.map{ meta, outs -> outs.findAll{ it -> it.name == "web_summary.html"} From c3eb5eac38b09f916f9bb7a0091493053f4d5365 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Mon, 19 Aug 2024 12:42:17 +0200 Subject: [PATCH 03/66] directly pass txp2gene --- bin/mtx_to_h5ad_star.py | 7 +++---- modules/local/mtx_to_h5ad_star.nf | 3 +-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/bin/mtx_to_h5ad_star.py b/bin/mtx_to_h5ad_star.py index a8c2ec22..0e388e61 100755 --- a/bin/mtx_to_h5ad_star.py +++ b/bin/mtx_to_h5ad_star.py @@ -31,7 +31,7 @@ def input_to_adata( barcode_file: str, feature_file: str, sample: str, - star_index: str, + txp2gene: str, ): print("Reading in {}".format(input_data)) @@ -39,7 +39,6 @@ def input_to_adata( adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample) # open gene information - txp2gene = "{}/geneInfo.tab".format(star_index) print("Reading in {}".format(txp2gene)) t2g = pd.read_table("{}".format(txp2gene), header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1]) t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id") @@ -67,7 +66,7 @@ def dump_versions(task_process): parser.add_argument("-s", "--sample", dest="sample", help="Sample name") parser.add_argument("-o", "--out", dest="out", help="Output path.") parser.add_argument("--task_process", dest="task_process", help="Task process name.") - parser.add_argument("--star_index", dest="star_index", help="Star index folder containing geneInfo.tab.", nargs="?", const="") + parser.add_argument("--txp2gene", dest="txp2gene", help="Star index folder containing geneInfo.tab.", nargs="?", const="") args = vars(parser.parse_args()) @@ -79,7 +78,7 @@ def dump_versions(task_process): barcode_file=args["barcode"], feature_file=args["feature"], sample=args["sample"], - star_index=args["star_index"] + txp2gene=args["txp2gene"] ) adata.write_h5ad(args["out"], compression="gzip") diff --git a/modules/local/mtx_to_h5ad_star.nf b/modules/local/mtx_to_h5ad_star.nf index ae42262f..35d6f2ef 100644 --- a/modules/local/mtx_to_h5ad_star.nf +++ b/modules/local/mtx_to_h5ad_star.nf @@ -29,7 +29,6 @@ process MTX_TO_H5AD_STAR { barcodes_tsv = "${input_type}/barcodes.tsv.gz" features_tsv = "${input_type}/features.tsv.gz" - """ # convert file types mtx_to_h5ad_star.py \\ @@ -38,7 +37,7 @@ process MTX_TO_H5AD_STAR { --input $mtx_matrix \\ --barcode $barcodes_tsv \\ --feature $features_tsv \\ - --star_index ${star_index} \\ + --txp2gene ${star_index}/geneInfo.tab \\ --out ${meta.id}/${meta.id}_${input_type}_matrix.h5ad """ From 4771ed1fa1dc7301a856bfe8edcc04a0f2d9c342 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Mon, 19 Aug 2024 13:02:50 +0200 Subject: [PATCH 04/66] simplify module lines --- modules/local/mtx_to_h5ad_star.nf | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/modules/local/mtx_to_h5ad_star.nf b/modules/local/mtx_to_h5ad_star.nf index 35d6f2ef..2ad9d4f0 100644 --- a/modules/local/mtx_to_h5ad_star.nf +++ b/modules/local/mtx_to_h5ad_star.nf @@ -25,18 +25,15 @@ process MTX_TO_H5AD_STAR { // check input type of inputs input_type = (input_to_check.toUriString().contains('unfiltered') || input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered' meta2 = meta + [input_type: input_type] - mtx_matrix = "${input_type}/matrix.mtx.gz" - barcodes_tsv = "${input_type}/barcodes.tsv.gz" - features_tsv = "${input_type}/features.tsv.gz" """ # convert file types mtx_to_h5ad_star.py \\ --task_process ${task.process} \\ --sample ${meta.id} \\ - --input $mtx_matrix \\ - --barcode $barcodes_tsv \\ - --feature $features_tsv \\ + --input ${input_type}/matrix.mtx.gz \\ + --barcode ${input_type}/barcodes.tsv.gz \\ + --feature ${input_type}/features.tsv.gz \\ --txp2gene ${star_index}/geneInfo.tab \\ --out ${meta.id}/${meta.id}_${input_type}_matrix.h5ad """ From af85f87512f06a1320db8083f111c23054ef473f Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Mon, 19 Aug 2024 13:03:53 +0200 Subject: [PATCH 05/66] emit h5ad on starsolo --- subworkflows/local/starsolo.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf index 19dc9221..1226ce5d 100644 --- a/subworkflows/local/starsolo.nf +++ b/subworkflows/local/starsolo.nf @@ -68,5 +68,6 @@ workflow STARSOLO { // get rid of meta for star index star_result = STAR_ALIGN.out.tab star_counts = STAR_ALIGN.out.counts + star_h5ad = MTX_TO_H5AD.out.h5ad for_multiqc = STAR_ALIGN.out.log_final.map{ meta, it -> it } } From d2a53861a73114c2656cad6999f08d74eb3a9e8b Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Mon, 19 Aug 2024 13:04:41 +0200 Subject: [PATCH 06/66] add versions emition --- subworkflows/local/starsolo.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf index 1226ce5d..8236632d 100644 --- a/subworkflows/local/starsolo.nf +++ b/subworkflows/local/starsolo.nf @@ -61,6 +61,7 @@ workflow STARSOLO { STAR_ALIGN.out.raw_counts.mix( STAR_ALIGN.out.filtered_counts ), star_index.map{ meta, index -> index } ) + ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions.first()) emit: From 0e5505197854b2da1271bf90bd2a5cf89357c7b8 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Mon, 19 Aug 2024 13:35:07 +0200 Subject: [PATCH 07/66] update module to use templates and cleanup way of priting versions --- bin/mtx_to_h5ad_star.py | 88 ----------------- modules/local/mtx_to_h5ad_star.nf | 12 +-- modules/local/templates/mtx_to_h5ad_star.py | 104 ++++++++++++++++++++ 3 files changed, 105 insertions(+), 99 deletions(-) delete mode 100755 bin/mtx_to_h5ad_star.py create mode 100755 modules/local/templates/mtx_to_h5ad_star.py diff --git a/bin/mtx_to_h5ad_star.py b/bin/mtx_to_h5ad_star.py deleted file mode 100755 index 0e388e61..00000000 --- a/bin/mtx_to_h5ad_star.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python - -# Set numba chache dir to current working directory (which is a writable mount also in containers) -import os - -os.environ["NUMBA_CACHE_DIR"] = "." - -import scanpy as sc -import pandas as pd -import argparse -from scipy import io -from anndata import AnnData - -def _mtx_to_adata( - mtx_file: str, - barcode_file: str, - feature_file: str, - sample: str, -): - adata = sc.read_mtx(mtx_file) - adata = adata.transpose() - adata.obs_names = pd.read_csv(barcode_file, header=None, sep="\t")[0].values - adata.var_names = pd.read_csv(feature_file, header=None, sep="\t")[0].values - adata.obs["sample"] = sample - - return adata - - -def input_to_adata( - input_data: str, - barcode_file: str, - feature_file: str, - sample: str, - txp2gene: str, -): - print("Reading in {}".format(input_data)) - - # open main data - adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample) - - # open gene information - print("Reading in {}".format(txp2gene)) - t2g = pd.read_table("{}".format(txp2gene), header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1]) - t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id") - - # standard format - adata.var.index = t2g.index.values.tolist() - adata.var["gene_symbol"] = t2g["gene_symbol"] - - return adata - -def dump_versions(task_process): - import pkg_resources - - with open("versions.yml", "w") as f: - f.write(f"{task_process}:\n\t") - f.write("\n\t".join([f"{pkg.key}: {pkg.version}" for pkg in pkg_resources.working_set])) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Converts mtx output to h5ad.") - - parser.add_argument("-i", "--input_data", dest="input_data", help="Path to either mtx or mtx h5 file.") - parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.", nargs="?", const="") - parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.", nargs="?", const="") - parser.add_argument("-s", "--sample", dest="sample", help="Sample name") - parser.add_argument("-o", "--out", dest="out", help="Output path.") - parser.add_argument("--task_process", dest="task_process", help="Task process name.") - parser.add_argument("--txp2gene", dest="txp2gene", help="Star index folder containing geneInfo.tab.", nargs="?", const="") - - args = vars(parser.parse_args()) - - # create the directory with the sample name - os.makedirs(os.path.dirname(args["out"]), exist_ok=True) - - adata = input_to_adata( - input_data=args["input_data"], - barcode_file=args["barcode"], - feature_file=args["feature"], - sample=args["sample"], - txp2gene=args["txp2gene"] - ) - - adata.write_h5ad(args["out"], compression="gzip") - - print("Wrote h5ad file to {}".format(args["out"])) - - dump_versions(task_process=args["task_process"]) diff --git a/modules/local/mtx_to_h5ad_star.nf b/modules/local/mtx_to_h5ad_star.nf index 2ad9d4f0..ded91538 100644 --- a/modules/local/mtx_to_h5ad_star.nf +++ b/modules/local/mtx_to_h5ad_star.nf @@ -26,17 +26,7 @@ process MTX_TO_H5AD_STAR { input_type = (input_to_check.toUriString().contains('unfiltered') || input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered' meta2 = meta + [input_type: input_type] - """ - # convert file types - mtx_to_h5ad_star.py \\ - --task_process ${task.process} \\ - --sample ${meta.id} \\ - --input ${input_type}/matrix.mtx.gz \\ - --barcode ${input_type}/barcodes.tsv.gz \\ - --feature ${input_type}/features.tsv.gz \\ - --txp2gene ${star_index}/geneInfo.tab \\ - --out ${meta.id}/${meta.id}_${input_type}_matrix.h5ad - """ + template 'mtx_to_h5ad_star.py' stub: """ diff --git a/modules/local/templates/mtx_to_h5ad_star.py b/modules/local/templates/mtx_to_h5ad_star.py new file mode 100755 index 00000000..88bd91ec --- /dev/null +++ b/modules/local/templates/mtx_to_h5ad_star.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python + +# Set numba chache dir to current working directory (which is a writable mount also in containers) +import os + +os.environ["NUMBA_CACHE_DIR"] = "." + +import scanpy as sc +import pandas as pd +import argparse +from anndata import AnnData +import platform + +def _mtx_to_adata( + mtx_file: str, + barcode_file: str, + feature_file: str, + sample: str, +): + adata = sc.read_mtx(mtx_file) + adata = adata.transpose() + adata.obs_names = pd.read_csv(barcode_file, header=None, sep="\t")[0].values + adata.var_names = pd.read_csv(feature_file, header=None, sep="\t")[0].values + adata.obs["sample"] = sample + + return adata + + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +def dump_versions(): + versions = { + "${task.process}": { + "python": platform.python_version(), + "scanpy": sc.__version__, + "pandas": pd.__version__ + } + } + + with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) + +def input_to_adata( + input_data: str, + output: str, + barcode_file: str, + feature_file: str, + sample: str, + txp2gene: str, +): + print(f"Reading in {input_data}") + + # open main data + adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample) + + # open gene information + print(f"Reading in {txp2gene}") + t2g = pd.read_table(f"{txp2gene}", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1]) + t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id") + + # standard format + adata.var.index = t2g.index.values.tolist() + adata.var["gene_symbol"] = t2g["gene_symbol"] + + # write results + adata.write_h5ad(f"{output}", compression="gzip") + print(f"Wrote h5ad file to {output}") + + # dump versions + dump_versions() + + return adata + +# +# Run main script +# + +# create the directory with the sample name +os.makedirs("${meta.id}", exist_ok=True) + +# input_type comes from NF module +adata = input_to_adata( + input_data="${input_type}/matrix.mtx.gz", + output="${meta.id}/${meta.id}_${input_type}_matrix.h5ad", + barcode_file="${input_type}/barcodes.tsv.gz", + feature_file="${input_type}/features.tsv.gz", + sample="${meta.id}", + txp2gene="${star_index}/geneInfo.tab" +) From 994d5a530d969c864c9c9465ef2d09cd083d9f49 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Tue, 20 Aug 2024 12:41:36 +0200 Subject: [PATCH 08/66] fix h5ad generator script --- modules/local/mtx_to_h5ad_star.nf | 6 ++-- modules/local/templates/mtx_to_h5ad_star.py | 33 +++++++-------------- 2 files changed, 12 insertions(+), 27 deletions(-) diff --git a/modules/local/mtx_to_h5ad_star.nf b/modules/local/mtx_to_h5ad_star.nf index ded91538..1895cb87 100644 --- a/modules/local/mtx_to_h5ad_star.nf +++ b/modules/local/mtx_to_h5ad_star.nf @@ -2,10 +2,8 @@ process MTX_TO_H5AD_STAR { tag "$meta.id" label 'process_medium' - conda "conda-forge::scanpy conda-forge::python-igraph conda-forge::leidenalg" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/scanpy:1.7.2--pyhdfd78af_0' : - 'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }" + conda "conda-forge::scanpy==1.10.2 conda-forge::python-igraph conda-forge::leidenalg" + container "community.wave.seqera.io/library/scanpy:1.10.2--e83da2205b92a538" input: tuple val(meta), path(inputs) diff --git a/modules/local/templates/mtx_to_h5ad_star.py b/modules/local/templates/mtx_to_h5ad_star.py index 88bd91ec..d6889671 100755 --- a/modules/local/templates/mtx_to_h5ad_star.py +++ b/modules/local/templates/mtx_to_h5ad_star.py @@ -12,15 +12,10 @@ import platform def _mtx_to_adata( - mtx_file: str, - barcode_file: str, - feature_file: str, + input: str, sample: str, ): - adata = sc.read_mtx(mtx_file) - adata = adata.transpose() - adata.obs_names = pd.read_csv(barcode_file, header=None, sep="\t")[0].values - adata.var_names = pd.read_csv(feature_file, header=None, sep="\t")[0].values + adata = sc.read_10x_mtx(input) adata.obs["sample"] = sample return adata @@ -58,24 +53,19 @@ def dump_versions(): def input_to_adata( input_data: str, output: str, - barcode_file: str, - feature_file: str, sample: str, - txp2gene: str, ): print(f"Reading in {input_data}") # open main data - adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample) - - # open gene information - print(f"Reading in {txp2gene}") - t2g = pd.read_table(f"{txp2gene}", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1]) - t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id") + adata = _mtx_to_adata(input_data, sample) # standard format - adata.var.index = t2g.index.values.tolist() - adata.var["gene_symbol"] = t2g["gene_symbol"] + # index are gene IDs and symbols are a column + adata.var["gene_symbol"] = adata.var.index + adata.var[['gene_ids', 'gene_versions']] = adata.var["gene_ids"].apply(lambda x: pd.Series(str(x).split("."))) + adata.var.index = adata.var["gene_ids"].values + adata.var = adata.var.drop("gene_ids", axis=1) # write results adata.write_h5ad(f"{output}", compression="gzip") @@ -95,10 +85,7 @@ def input_to_adata( # input_type comes from NF module adata = input_to_adata( - input_data="${input_type}/matrix.mtx.gz", + input_data="${input_type}", output="${meta.id}/${meta.id}_${input_type}_matrix.h5ad", - barcode_file="${input_type}/barcodes.tsv.gz", - feature_file="${input_type}/features.tsv.gz", - sample="${meta.id}", - txp2gene="${star_index}/geneInfo.tab" + sample="${meta.id}" ) From b317165c43e47999f78198df51f37059e8fd1388 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Tue, 20 Aug 2024 12:43:01 +0200 Subject: [PATCH 09/66] simplify check --- modules/local/mtx_to_h5ad_star.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/mtx_to_h5ad_star.nf b/modules/local/mtx_to_h5ad_star.nf index 1895cb87..84474ed0 100644 --- a/modules/local/mtx_to_h5ad_star.nf +++ b/modules/local/mtx_to_h5ad_star.nf @@ -21,7 +21,7 @@ process MTX_TO_H5AD_STAR { def input_to_check = (inputs instanceof String) ? inputs : inputs[0] // check input type of inputs - input_type = (input_to_check.toUriString().contains('unfiltered') || input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered' + input_type = (input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered' meta2 = meta + [input_type: input_type] template 'mtx_to_h5ad_star.py' From 72e9d50f91e8fb8fc870b9ed223f4bcc8fc77211 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Wed, 21 Aug 2024 09:23:17 +0200 Subject: [PATCH 10/66] Fix h5ad structure --- modules/local/templates/mtx_to_h5ad_star.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/local/templates/mtx_to_h5ad_star.py b/modules/local/templates/mtx_to_h5ad_star.py index d6889671..48749114 100755 --- a/modules/local/templates/mtx_to_h5ad_star.py +++ b/modules/local/templates/mtx_to_h5ad_star.py @@ -63,7 +63,8 @@ def input_to_adata( # standard format # index are gene IDs and symbols are a column adata.var["gene_symbol"] = adata.var.index - adata.var[['gene_ids', 'gene_versions']] = adata.var["gene_ids"].apply(lambda x: pd.Series(str(x).split("."))) + adata.var['gene_versions'] = adata.var["gene_ids"] + adata.var['gene_ids'] = adata.var['gene_versions'].str.split('.').str[0] adata.var.index = adata.var["gene_ids"].values adata.var = adata.var.drop("gene_ids", axis=1) From 882908024464e740fa5dd271ff78b9574450f76e Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 30 Aug 2024 10:49:57 +0200 Subject: [PATCH 11/66] updated concat module --- modules/local/concat_h5ad.nf | 15 ++----- .../local/templates}/concat_h5ad.py | 25 +++-------- subworkflows/local/mtx_conversion.nf | 31 +++---------- workflows/scrnaseq.nf | 45 +++++++------------ 4 files changed, 30 insertions(+), 86 deletions(-) rename {bin => modules/local/templates}/concat_h5ad.py (53%) diff --git a/modules/local/concat_h5ad.nf b/modules/local/concat_h5ad.nf index cd08cbbe..37b14156 100644 --- a/modules/local/concat_h5ad.nf +++ b/modules/local/concat_h5ad.nf @@ -1,13 +1,11 @@ process CONCAT_H5AD { label 'process_medium' - conda "conda-forge::scanpy conda-forge::python-igraph conda-forge::leidenalg" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/scanpy:1.7.2--pyhdfd78af_0' : - 'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }" + conda "conda-forge::scanpy==1.10.2 conda-forge::python-igraph conda-forge::leidenalg" + container "community.wave.seqera.io/library/scanpy:1.10.2--e83da2205b92a538" input: - tuple val(input_type), path(h5ad) + tuple val(meta), path(h5ad) path samplesheet output: @@ -17,12 +15,7 @@ process CONCAT_H5AD { task.ext.when == null || task.ext.when script: - """ - concat_h5ad.py \\ - --input $samplesheet \\ - --out combined_${input_type}_matrix.h5ad \\ - --suffix "_matrix.h5ad" - """ + template 'concat_h5ad.py' stub: """ diff --git a/bin/concat_h5ad.py b/modules/local/templates/concat_h5ad.py similarity index 53% rename from bin/concat_h5ad.py rename to modules/local/templates/concat_h5ad.py index 43ea071a..033bc89a 100755 --- a/bin/concat_h5ad.py +++ b/modules/local/templates/concat_h5ad.py @@ -7,7 +7,6 @@ import scanpy as sc, anndata as ad, pandas as pd from pathlib import Path -import argparse def read_samplesheet(samplesheet): @@ -17,36 +16,24 @@ def read_samplesheet(samplesheet): # samplesheet may contain replicates, when it has, # group information from replicates and collapse with commas # only keep unique values using set() - df = df.groupby(["sample"]).agg(lambda column: ",".join(set(column))) + df = df.groupby(["sample"]).agg(lambda column: ",".join(set(str(column)))) return df if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Concatenates h5ad files and merge metadata from samplesheet") - - parser.add_argument("-i", "--input", dest="input", help="Path to samplesheet.csv") - parser.add_argument("-o", "--out", dest="out", help="Output path.") - parser.add_argument( - "-s", - "--suffix", - dest="suffix", - help="Suffix of matrices to remove and get sample name", - ) - - args = vars(parser.parse_args()) # Open samplesheet as dataframe - df_samplesheet = read_samplesheet(args["input"]) + df_samplesheet = read_samplesheet("${samplesheet}") # find all h5ad and append to dict - dict_of_h5ad = {str(path).replace(args["suffix"], ""): sc.read_h5ad(path) for path in Path(".").rglob("*.h5ad")} + dict_of_h5ad = {str(path).replace("_matrix.h5ad", ""): sc.read_h5ad(path) for path in Path(".").rglob("*.h5ad")} # concat h5ad files adata = ad.concat(dict_of_h5ad, label="sample", merge="unique", index_unique="_") # merge with data.frame, on sample information - adata.obs = adata.obs.join(df_samplesheet, on="sample") - adata.write_h5ad(args["out"], compression="gzip") + adata.obs = adata.obs.join(df_samplesheet, on="sample").astype(str) + adata.write_h5ad("combined_${meta.input_type}_matrix.h5ad", compression="gzip") - print("Wrote h5ad file to {}".format(args["out"])) + print("Wrote h5ad file to {}".format("combined_${meta.input_type}_matrix.h5ad")) diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 98e49a2e..0d076db0 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -8,35 +8,14 @@ workflow MTX_CONVERSION { take: mtx_matrices samplesheet - txp2gene - star_index main: ch_versions = Channel.empty() - // Cellranger module output contains too many files which cause path collisions, we filter to the ones we need. - // Keeping backwards compatibility with cellranger-arc. - // TODO: Adapt cellranger-arc subworkflow like cellranger to remove this snippet here. - if (params.aligner in [ 'cellrangerarc' ]) { - mtx_matrices = mtx_matrices.map { meta, mtx_files -> - [ meta, mtx_files.findAll { it.toString().contains("filtered_feature_bc_matrix") } ] - } - .filter { meta, mtx_files -> mtx_files } // Remove any that are missing the relevant files - } - - // - // Convert matrix to h5ad - // - MTX_TO_H5AD ( - mtx_matrices, - txp2gene, - star_index - ) - // // Concat sample-specific h5ad in one // - ch_concat_h5ad_input = MTX_TO_H5AD.out.h5ad.groupTuple() // gather all sample-specific files / per type + ch_concat_h5ad_input = mtx_matrices.groupTuple() // gather all sample-specific files / per type if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') { // when having spliced / unspliced matrices, the collected tuple has two levels ( [[mtx_1, mtx_2]] ) // which nextflow break because it is not a valid 'path' thus, we have to remove one level @@ -51,12 +30,12 @@ workflow MTX_CONVERSION { // // Convert matrix do seurat // - MTX_TO_SEURAT ( - mtx_matrices - ) + // MTX_TO_SEURAT ( + // mtx_matrices + // ) //TODO CONCAT h5ad and MTX to h5ad should also have versions.yaml output - ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions, MTX_TO_SEURAT.out.versions) + // ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions, MTX_TO_SEURAT.out.versions) emit: ch_versions diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index e58978da..333699a7 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -86,8 +86,8 @@ workflow SCRNASEQ { ch_multi_samplesheet = params.cellranger_multi_barcodes ? file(params.cellranger_multi_barcodes, checkIfExists: true) : [] empty_file = file("$projectDir/assets/EMPTY", checkIfExists: true) - ch_versions = Channel.empty() - ch_mtx_matrices = Channel.empty() + ch_versions = Channel.empty() + ch_h5ad_matrices = Channel.empty() // Run FastQC if (!params.skip_fastqc) { @@ -137,7 +137,7 @@ workflow SCRNASEQ { ch_fastq ) ch_versions = ch_versions.mix(KALLISTO_BUSTOOLS.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(KALLISTO_BUSTOOLS.out.raw_counts, KALLISTO_BUSTOOLS.out.filtered_counts) + ch_h5ad_matrices = ch_h5ad_matrices.mix(KALLISTO_BUSTOOLS.out.raw_counts, KALLISTO_BUSTOOLS.out.filtered_counts) ch_txp2gene = KALLISTO_BUSTOOLS.out.txp2gene } @@ -155,7 +155,7 @@ workflow SCRNASEQ { ) ch_versions = ch_versions.mix(SCRNASEQ_ALEVIN.out.ch_versions) ch_multiqc_files = ch_multiqc_files.mix(SCRNASEQ_ALEVIN.out.alevin_results.map{ meta, it -> it }) - ch_mtx_matrices = ch_mtx_matrices.mix(SCRNASEQ_ALEVIN.out.alevin_results) + ch_h5ad_matrices = ch_h5ad_matrices.mix(SCRNASEQ_ALEVIN.out.alevin_results) } // Run STARSolo pipeline @@ -172,6 +172,7 @@ workflow SCRNASEQ { ) ch_versions = ch_versions.mix(STARSOLO.out.ch_versions) ch_multiqc_files = ch_multiqc_files.mix(STARSOLO.out.for_multiqc) + ch_h5ad_matrices = STARSOLO.out.star_h5ad } // Run cellranger pipeline @@ -200,7 +201,7 @@ workflow SCRNASEQ { ch_fastq ) ch_versions = ch_versions.mix(UNIVERSC_ALIGN.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(UNIVERSC_ALIGN.out.universc_out) + ch_h5ad_matrices = ch_h5ad_matrices.mix(UNIVERSC_ALIGN.out.universc_out) } // Run cellrangerarc pipeline @@ -214,7 +215,7 @@ workflow SCRNASEQ { ch_cellrangerarc_config ) ch_versions = ch_versions.mix(CELLRANGERARC_ALIGN.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGERARC_ALIGN.out.cellranger_arc_out) + ch_h5ad_matrices = ch_h5ad_matrices.mix(CELLRANGERARC_ALIGN.out.cellranger_arc_out) } // Run cellrangermulti pipeline @@ -282,39 +283,23 @@ workflow SCRNASEQ { ch_multiqc_files = ch_multiqc_files.mix( CELLRANGER_MULTI_ALIGN.out.cellrangermulti_out.map{ meta, outs -> outs.findAll{ it -> it.name == "web_summary.html" } }) - ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_MULTI_ALIGN.out.cellrangermulti_mtx) + ch_h5ad_matrices = ch_h5ad_matrices.mix(CELLRANGER_MULTI_ALIGN.out.cellrangermulti_mtx) } // Run emptydrops calling module // if ( !params.skip_emptydrops && !(params.aligner in ['cellrangerarc']) ) { - // // - // // emptydrops should only run on the raw matrices thus, filter-out the filtered result of the aligners that can produce it - // // - // if ( params.aligner in [ 'cellranger', 'cellrangermulti', 'kallisto', 'star' ] ) { - // ch_mtx_matrices_for_emptydrops = - // ch_mtx_matrices.filter { meta, mtx_files -> - // mtx_files.toString().contains("raw_feature_bc_matrix") || // cellranger - // mtx_files.toString().contains("counts_unfiltered") || // kallisto - // mtx_files.toString().contains("raw") // star - // } - // } else { - // ch_mtx_matrices_for_emptydrops = ch_mtx_matrices - // } - - // EMPTYDROPS_CELL_CALLING( ch_mtx_matrices_for_emptydrops ) - // ch_mtx_matrices = ch_mtx_matrices.mix( EMPTYDROPS_CELL_CALLING.out.filtered_matrices ) + // EMPTYDROPS_CELL_CALLING( ch_h5ad_matrices.filter{ it[0].input_type == 'raw' } ) + // ch_h5ad_matrices = ch_h5ad_matrices.mix( EMPTYDROPS_CELL_CALLING.out.filtered_matrices ) // } - // // Run mtx to h5ad conversion subworkflow - // MTX_CONVERSION ( - // ch_mtx_matrices, - // ch_input, - // ch_txp2gene, - // ch_star_index - // ) + // Run mtx to h5ad conversion subworkflow + MTX_CONVERSION ( + ch_h5ad_matrices, + ch_input + ) // //Add Versions from MTX Conversion workflow too // ch_versions.mix(MTX_CONVERSION.out.ch_versions) From 9fc75b482b6142a6e76eeb8d82bb11814024d059 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 30 Aug 2024 10:59:25 +0200 Subject: [PATCH 12/66] workflow misses emptydrops and seurat & mtx conversion modules --- workflows/scrnaseq.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 333699a7..10569c35 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -301,8 +301,8 @@ workflow SCRNASEQ { ch_input ) - // //Add Versions from MTX Conversion workflow too - // ch_versions.mix(MTX_CONVERSION.out.ch_versions) + //Add Versions from MTX Conversion workflow too + ch_versions.mix(MTX_CONVERSION.out.ch_versions) // // Collate and save software versions From dc69f47f1ae8c080fdad3084ebc9cedea5a13ede Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Thu, 12 Sep 2024 13:55:14 +0200 Subject: [PATCH 13/66] start emptydrops cellbender subworkflow --- modules.json | 5 + modules/adata_barcodes.nf | 23 ++ modules/local/templates/barcodes.py | 44 ++++ .../removebackground/environment.yml | 5 + .../cellbender/removebackground/main.nf | 65 ++++++ .../cellbender/removebackground/meta.yml | 75 +++++++ .../removebackground/tests/epochs.config | 6 + .../removebackground/tests/main.nf.test | 66 ++++++ .../removebackground/tests/main.nf.test.snap | 196 ++++++++++++++++++ .../removebackground/tests/tags.yml | 2 + subworkflows/local/emptydrops_removal.nf | 25 +++ subworkflows/local/mtx_conversion.nf | 7 +- workflows/scrnaseq.nf | 9 - 13 files changed, 516 insertions(+), 12 deletions(-) create mode 100644 modules/adata_barcodes.nf create mode 100644 modules/local/templates/barcodes.py create mode 100644 modules/nf-core/cellbender/removebackground/environment.yml create mode 100644 modules/nf-core/cellbender/removebackground/main.nf create mode 100644 modules/nf-core/cellbender/removebackground/meta.yml create mode 100644 modules/nf-core/cellbender/removebackground/tests/epochs.config create mode 100644 modules/nf-core/cellbender/removebackground/tests/main.nf.test create mode 100644 modules/nf-core/cellbender/removebackground/tests/main.nf.test.snap create mode 100644 modules/nf-core/cellbender/removebackground/tests/tags.yml create mode 100644 subworkflows/local/emptydrops_removal.nf diff --git a/modules.json b/modules.json index aa186d98..b9680cfe 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "cellbender/removebackground": { + "branch": "master", + "git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48", + "installed_by": ["modules"] + }, "cellranger/count": { "branch": "master", "git_sha": "90dad5491658049282ceb287a3d7732c1ce39837", diff --git a/modules/adata_barcodes.nf b/modules/adata_barcodes.nf new file mode 100644 index 00000000..2aef8ec9 --- /dev/null +++ b/modules/adata_barcodes.nf @@ -0,0 +1,23 @@ +process ADATA_BARCODES { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'oras://community.wave.seqera.io/library/anndata:0.10.7--e9840a94592528c8': + 'community.wave.seqera.io/library/anndata:0.10.7--336c6c1921a0632b' }" + + input: + tuple val(meta), path(h5ad), path(barcodes_csv) + + output: + tuple val(meta), path("*.h5ad"), emit: h5ad + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${meta.id}" + template 'barcodes.py' +} diff --git a/modules/local/templates/barcodes.py b/modules/local/templates/barcodes.py new file mode 100644 index 00000000..8a9b10a7 --- /dev/null +++ b/modules/local/templates/barcodes.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +import platform +import anndata as ad +import pandas as pd + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +df = pd.read_csv("${barcodes_csv}", header=None) +adata = ad.read_h5ad("${h5ad}") + +adata = adata[df[0].values] + +adata.write_h5ad("${prefix}.h5ad") + +# Versions + +versions = { + "${task.process}": { + "python": platform.python_version(), + "anndata": ad.__version__, + "pandas": pd.__version__ + } +} + +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/nf-core/cellbender/removebackground/environment.yml b/modules/nf-core/cellbender/removebackground/environment.yml new file mode 100644 index 00000000..a157c522 --- /dev/null +++ b/modules/nf-core/cellbender/removebackground/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::cellbender=0.3.0 diff --git a/modules/nf-core/cellbender/removebackground/main.nf b/modules/nf-core/cellbender/removebackground/main.nf new file mode 100644 index 00000000..f3cfd1ff --- /dev/null +++ b/modules/nf-core/cellbender/removebackground/main.nf @@ -0,0 +1,65 @@ +process CELLBENDER_REMOVEBACKGROUND { + tag "$meta.id" + label 'process_medium' + label 'process_gpu' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'oras://community.wave.seqera.io/library/cellbender:0.3.0--c4addb97ab2d83fe': + 'community.wave.seqera.io/library/cellbender:0.3.0--41318a055fc3aacb' }" + + input: + tuple val(meta), path(h5ad) + + output: + tuple val(meta), path("${prefix}.h5") , emit: h5 + tuple val(meta), path("${prefix}_filtered.h5") , emit: filtered_h5 + tuple val(meta), path("${prefix}_posterior.h5") , emit: posterior_h5 + tuple val(meta), path("${prefix}_cell_barcodes.csv"), emit: barcodes + tuple val(meta), path("${prefix}_metrics.csv") , emit: metrics + tuple val(meta), path("${prefix}_report.html") , emit: report + tuple val(meta), path("${prefix}.pdf") , emit: pdf + tuple val(meta), path("${prefix}.log") , emit: log + tuple val(meta), path("ckpt.tar.gz") , emit: checkpoint + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${meta.id}" + args = task.ext.args ?: "" + use_gpu = task.ext.use_gpu ? "--cuda" : "" + """ + TMPDIR=. cellbender remove-background \ + ${args} \ + --cpu-threads ${task.cpus} \ + ${use_gpu} \ + --input ${h5ad} \ + --output ${prefix}.h5 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cellbender: \$(cellbender --version) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch "${prefix}.h5" + touch "${prefix}_filtered.h5" + touch "${prefix}_posterior.h5" + touch "${prefix}_cell_barcodes.csv" + touch "${prefix}_metrics.csv" + touch "${prefix}_report.html" + touch "${prefix}.pdf" + touch "${prefix}.log" + touch "ckpt.tar.gz" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cellbender: \$(cellbender --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellbender/removebackground/meta.yml b/modules/nf-core/cellbender/removebackground/meta.yml new file mode 100644 index 00000000..d70fa3fd --- /dev/null +++ b/modules/nf-core/cellbender/removebackground/meta.yml @@ -0,0 +1,75 @@ +name: cellbender_removebackground +description: Module to use CellBender to estimate ambient RNA from single-cell RNA-seq data +keywords: + - single-cell + - scRNA-seq + - ambient RNA removal +tools: + - cellbender: + description: CellBender is a software package for eliminating technical artifacts from high-throughput single-cell RNA sequencing (scRNA-seq) data. + documentation: https://cellbender.readthedocs.io/en/latest/ + tool_dev_url: https://github.com/broadinstitute/CellBender + licence: ["BSD-3-Clause"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - h5ad: + type: file + description: AnnData file containing unfiltered data (with empty droplets) + pattern: "*.h5ad" +output: + - h5: + type: file + description: Full count matrix as an h5 file, with background RNA removed. This file contains all the original droplet barcodes. + pattern: "*.h5" + - filtered_h5: + type: file + description: | + Full count matrix as an h5 file, with background RNA removed. This file contains only the droplet barcodes which were determined to have a > 50% posterior probability of containing cells. + pattern: "*.h5" + - posterior_h5: + type: file + description: | + The full posterior probability of noise counts. This is not normally used downstream. + pattern: "*.h5" + - barcodes: + type: file + description: | + CSV file containing all the droplet barcodes which were determined to have a > 50% posterior probability of containing cells. | + Barcodes are written in plain text. This information is also contained in each of the above outputs, | + but is included as a separate output for convenient use in certain downstream applications. + pattern: "*.csv" + - metrics: + type: file + description: | + Metrics describing the run, potentially to be used to flag problematic runs | + when using CellBender as part of a large-scale automated pipeline. + pattern: "*.csv" + - report: + type: file + description: | + HTML report including plots and commentary, along with any warnings or suggestions for improved parameter settings. + pattern: "*.html" + - pdf: + type: file + description: PDF file that provides a standard graphical summary of the inference procedure. + pattern: "*.pdf" + - log: + type: file + description: Log file produced by the cellbender remove-background run. + pattern: "*.log" + - checkpoint: + type: file + description: Checkpoint file which contains the trained model and the full posterior. + pattern: "*.ckpt" + - versions: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@nictru" +maintainers: + - "@nictru" diff --git a/modules/nf-core/cellbender/removebackground/tests/epochs.config b/modules/nf-core/cellbender/removebackground/tests/epochs.config new file mode 100644 index 00000000..96282b07 --- /dev/null +++ b/modules/nf-core/cellbender/removebackground/tests/epochs.config @@ -0,0 +1,6 @@ + +process { + withName: CELLBENDER_REMOVEBACKGROUND { + ext.args = '--epochs 20' + } +} diff --git a/modules/nf-core/cellbender/removebackground/tests/main.nf.test b/modules/nf-core/cellbender/removebackground/tests/main.nf.test new file mode 100644 index 00000000..1afa6f3b --- /dev/null +++ b/modules/nf-core/cellbender/removebackground/tests/main.nf.test @@ -0,0 +1,66 @@ +nextflow_process { + name 'Test Process CELLBENDER_REMOVEBACKGROUND' + script '../main.nf' + process 'CELLBENDER_REMOVEBACKGROUND' + + tag "modules" + tag "modules_nfcore" + tag "cellbender/removebackground" + tag "cellbender" + + test("test_cellbender_removebackground") { + config './epochs.config' + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file("https://raw.githubusercontent.com/nf-core/test-datasets/scdownstream/samples/SAMN14430799_raw_matrix_5k.h5ad", checkIfExists: true) + ] + """ + } + } + then { + assertAll( + {assert process.success}, + {assert file(process.out.h5.get(0).get(1)).exists()}, + {assert file(process.out.filtered_h5.get(0).get(1)).exists()}, + {assert file(process.out.posterior_h5.get(0).get(1)).exists()}, + {assert snapshot(process.out.barcodes).match("cellbender_removebackground_barcodes")}, + {assert snapshot(process.out.metrics).match("cellbender_removebackground_metrics")}, + {assert file(process.out.report.get(0).get(1)).exists()}, + {assert file(process.out.pdf.get(0).get(1)).exists()}, + {assert file(process.out.log.get(0).get(1)).exists()}, + {assert snapshot(process.out.versions).match("cellbender_removebackground_versions")} + ) + } + } + + test("test_cellbender_removebackground - stub") { + options '-stub' + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file("https://raw.githubusercontent.com/nf-core/test-datasets/scdownstream/samples/SAMN14430799_raw_matrix_5k.h5ad", checkIfExists: true) + ] + """ + } + } + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out.h5).match("cellbender_removebackground_h5_stub")}, + {assert snapshot(process.out.filtered_h5).match("cellbender_removebackground_filtered_h5_stub")}, + {assert snapshot(process.out.posterior_h5).match("cellbender_removebackground_posterior_h5_stub")}, + {assert snapshot(process.out.barcodes).match("cellbender_removebackground_barcodes_stub")}, + {assert snapshot(process.out.metrics).match("cellbender_removebackground_metrics_stub")}, + {assert snapshot(process.out.report).match("cellbender_removebackground_report_stub")}, + {assert snapshot(process.out.pdf).match("cellbender_removebackground_pdf_stub")}, + {assert snapshot(process.out.log).match("cellbender_removebackground_log_stub")}, + {assert snapshot(process.out.versions).match("cellbender_removebackground_versions_stub")} + ) + } + } +} diff --git a/modules/nf-core/cellbender/removebackground/tests/main.nf.test.snap b/modules/nf-core/cellbender/removebackground/tests/main.nf.test.snap new file mode 100644 index 00000000..fdb51d66 --- /dev/null +++ b/modules/nf-core/cellbender/removebackground/tests/main.nf.test.snap @@ -0,0 +1,196 @@ +{ + "cellbender_removebackground_versions": { + "content": [ + [ + "versions.yml:md5,b236ac7595dfa6cd4d51ac73e51cb05a" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-12T13:41:09.33127881" + }, + "cellbender_removebackground_filtered_h5_stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_filtered.h5:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-12T13:41:20.833598082" + }, + "cellbender_removebackground_pdf_stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.pdf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-12T13:41:20.891829278" + }, + "cellbender_removebackground_metrics": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_metrics.csv:md5,88272bde1c157528b0b0ab2abe5ad26f" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-12T13:41:09.327155805" + }, + "cellbender_removebackground_versions_stub": { + "content": [ + [ + "versions.yml:md5,b236ac7595dfa6cd4d51ac73e51cb05a" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-12T13:41:20.904614838" + }, + "cellbender_removebackground_h5_stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.h5:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-12T13:41:20.829304361" + }, + "cellbender_removebackground_metrics_stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_metrics.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-12T13:41:20.870469733" + }, + "cellbender_removebackground_log_stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-12T13:41:20.899293304" + }, + "cellbender_removebackground_barcodes": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_cell_barcodes.csv:md5,c8e8df9d0f9aea976d6f6aa36d329429" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-12T13:41:09.316098811" + }, + "cellbender_removebackground_report_stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-12T13:41:20.885307244" + }, + "cellbender_removebackground_posterior_h5_stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_posterior.h5:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-12T13:41:20.838032754" + }, + "cellbender_removebackground_barcodes_stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_cell_barcodes.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-12T13:41:20.861284979" + } +} \ No newline at end of file diff --git a/modules/nf-core/cellbender/removebackground/tests/tags.yml b/modules/nf-core/cellbender/removebackground/tests/tags.yml new file mode 100644 index 00000000..d935083b --- /dev/null +++ b/modules/nf-core/cellbender/removebackground/tests/tags.yml @@ -0,0 +1,2 @@ +cellbender/removebackground: + - modules/nf-core/cellbender/removebackground/** diff --git a/subworkflows/local/emptydrops_removal.nf b/subworkflows/local/emptydrops_removal.nf new file mode 100644 index 00000000..202612d6 --- /dev/null +++ b/subworkflows/local/emptydrops_removal.nf @@ -0,0 +1,25 @@ +include { CELLBENDER_REMOVEBACKGROUND } from '../../modules/nf-core/cellbender/removebackground' +include { ADATA_BARCODES } from '../../modules/local/adata_barcodes' + +workflow EMPTY_DROPLET_REMOVAL { + take: + ch_unfiltered + + main: + ch_versions = Channel.empty() + + CELLBENDER_REMOVEBACKGROUND(ch_unfiltered) + ch_versions = ch_versions.mix(CELLBENDER_REMOVEBACKGROUND.out.versions) + + ch_combined = ch_unfiltered.join(CELLBENDER_REMOVEBACKGROUND.out.barcodes) + + ADATA_BARCODES(ch_combined) + ch_versions = ch_versions.mix(ADATA_BARCODES.out.versions) + + ch_h5ad = ADATA_BARCODES.out.h5ad + + emit: + h5ad = ch_h5ad + + versions = ch_versions +} diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 0d076db0..7cca836d 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -1,7 +1,8 @@ /* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ -include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad.nf' -include { CONCAT_H5AD } from '../../modules/local/concat_h5ad.nf' -include { MTX_TO_SEURAT } from '../../modules/local/mtx_to_seurat.nf' +include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad.nf' +include { CONCAT_H5AD } from '../../modules/local/concat_h5ad.nf' +include { MTX_TO_SEURAT } from '../../modules/local/mtx_to_seurat.nf' +include { EMPTY_DROPLET_REMOVAL } from '../subworkflows/local/emptydrops_removal' workflow MTX_CONVERSION { diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 10569c35..bc5eaef6 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -9,7 +9,6 @@ include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc" include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion" include { GTF_GENE_FILTER } from '../modules/local/gtf_gene_filter' -include { EMPTYDROPS_CELL_CALLING } from '../modules/local/emptydrops' include { GUNZIP as GUNZIP_FASTA } from '../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_GTF } from '../modules/nf-core/gunzip/main' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' @@ -287,14 +286,6 @@ workflow SCRNASEQ { } - // Run emptydrops calling module - // if ( !params.skip_emptydrops && !(params.aligner in ['cellrangerarc']) ) { - - // EMPTYDROPS_CELL_CALLING( ch_h5ad_matrices.filter{ it[0].input_type == 'raw' } ) - // ch_h5ad_matrices = ch_h5ad_matrices.mix( EMPTYDROPS_CELL_CALLING.out.filtered_matrices ) - - // } - // Run mtx to h5ad conversion subworkflow MTX_CONVERSION ( ch_h5ad_matrices, From 9c204200d0f86f7bc716343404bce2ad9e1c1636 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 13 Sep 2024 10:57:43 +0200 Subject: [PATCH 14/66] fix paths --- modules/{ => local}/adata_barcodes.nf | 0 subworkflows/local/mtx_conversion.nf | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename modules/{ => local}/adata_barcodes.nf (100%) diff --git a/modules/adata_barcodes.nf b/modules/local/adata_barcodes.nf similarity index 100% rename from modules/adata_barcodes.nf rename to modules/local/adata_barcodes.nf diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 7cca836d..4d3ba721 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -2,7 +2,7 @@ include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad.nf' include { CONCAT_H5AD } from '../../modules/local/concat_h5ad.nf' include { MTX_TO_SEURAT } from '../../modules/local/mtx_to_seurat.nf' -include { EMPTY_DROPLET_REMOVAL } from '../subworkflows/local/emptydrops_removal' +include { EMPTY_DROPLET_REMOVAL } from './emptydrops_removal' workflow MTX_CONVERSION { From 683119598e67e5073f43b488e457a8f4d868f14f Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 13 Sep 2024 12:46:51 +0200 Subject: [PATCH 15/66] started the anndatar standardization module --- modules/local/anndatar_convert.nf | 24 ++++++++++++++++++++++ modules/local/templates/anndatar_convert.R | 20 ++++++++++++++++++ subworkflows/local/mtx_conversion.nf | 9 ++++++++ 3 files changed, 53 insertions(+) create mode 100644 modules/local/anndatar_convert.nf create mode 100755 modules/local/templates/anndatar_convert.R diff --git a/modules/local/anndatar_convert.nf b/modules/local/anndatar_convert.nf new file mode 100644 index 00000000..f3552ef0 --- /dev/null +++ b/modules/local/anndatar_convert.nf @@ -0,0 +1,24 @@ +process ANNDATAR_CONVERT { + label 'process_medium' + + container "fmalmeida/anndatar:dev" + + input: + tuple val(meta), path(h5ad) + + output: + tuple val(meta), path("${meta.id}_standardized.h5ad"), emit: h5ad + tuple val(meta), path("${meta.id}_standardized.Rds"), emit: rds + + when: + task.ext.when == null || task.ext.when + + script: + template 'anndatar_convert.R' + + stub: + """ + touch ${meta.id}_standardized.h5ad + touch ${meta.id}_standardized.Rds + """ +} diff --git a/modules/local/templates/anndatar_convert.R b/modules/local/templates/anndatar_convert.R new file mode 100755 index 00000000..016f8bb4 --- /dev/null +++ b/modules/local/templates/anndatar_convert.R @@ -0,0 +1,20 @@ +#!/usr/bin/env Rscript + +# to use nf variables: "${meta.id}" + +# load libraries +library(anndataR) + +# read input +adata <- read_h5ad("${h5ad}") + +# +# scope to perform standardization options +# + +# convert to Rds +obj <- adata\$to_Seurat() + +# save files +write_h5ad(adata, "${meta.id}_standardized.h5ad") +saveRDS(obj, file = "${meta.id}_standardized.Rds") diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 4d3ba721..6fc0ffb9 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -1,4 +1,5 @@ /* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ +include { ANNDATAR_CONVERT } from '../../modules/local/anndatar_convert' include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad.nf' include { CONCAT_H5AD } from '../../modules/local/concat_h5ad.nf' include { MTX_TO_SEURAT } from '../../modules/local/mtx_to_seurat.nf' @@ -13,6 +14,13 @@ workflow MTX_CONVERSION { main: ch_versions = Channel.empty() + // + // MODULE: Standardize h5ad and convert with AnndataR package + // + ANNDATAR_CONVERT ( + mtx_matrices + ) + // // Concat sample-specific h5ad in one // @@ -23,6 +31,7 @@ workflow MTX_CONVERSION { // making it as [ mtx_1, mtx_2 ] ch_concat_h5ad_input = ch_concat_h5ad_input.map{ type, matrices -> [ type, matrices.flatten().toList() ] } } + CONCAT_H5AD ( ch_concat_h5ad_input, samplesheet From 237d1ca3c65f1fc3b236e8d7ab8ac552e447779c Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 13 Sep 2024 12:49:09 +0200 Subject: [PATCH 16/66] concat h5ad with anndatar h5ad --- subworkflows/local/mtx_conversion.nf | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 6fc0ffb9..0178868f 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -24,7 +24,7 @@ workflow MTX_CONVERSION { // // Concat sample-specific h5ad in one // - ch_concat_h5ad_input = mtx_matrices.groupTuple() // gather all sample-specific files / per type + ch_concat_h5ad_input = ANNDATAR_CONVERT.out.h5ad.groupTuple() // gather all sample-specific files / per type if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') { // when having spliced / unspliced matrices, the collected tuple has two levels ( [[mtx_1, mtx_2]] ) // which nextflow break because it is not a valid 'path' thus, we have to remove one level @@ -37,13 +37,6 @@ workflow MTX_CONVERSION { samplesheet ) - // - // Convert matrix do seurat - // - // MTX_TO_SEURAT ( - // mtx_matrices - // ) - //TODO CONCAT h5ad and MTX to h5ad should also have versions.yaml output // ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions, MTX_TO_SEURAT.out.versions) From f5fb4a27cb3b70211403e62fbc7ffc485ad9f64a Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 13 Sep 2024 12:51:03 +0200 Subject: [PATCH 17/66] update tag information --- modules/local/anndatar_convert.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/local/anndatar_convert.nf b/modules/local/anndatar_convert.nf index f3552ef0..3f985b64 100644 --- a/modules/local/anndatar_convert.nf +++ b/modules/local/anndatar_convert.nf @@ -1,4 +1,6 @@ process ANNDATAR_CONVERT { + tag "${meta.id}" + label 'process_medium' container "fmalmeida/anndatar:dev" From 752666bff34142a2bc2b6a947c5bad4a3c3ba236 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 13 Sep 2024 12:58:07 +0200 Subject: [PATCH 18/66] update tags --- modules/local/concat_h5ad.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/local/concat_h5ad.nf b/modules/local/concat_h5ad.nf index 37b14156..41310553 100644 --- a/modules/local/concat_h5ad.nf +++ b/modules/local/concat_h5ad.nf @@ -1,4 +1,6 @@ process CONCAT_H5AD { + tag "${meta.id}" + label 'process_medium' conda "conda-forge::scanpy==1.10.2 conda-forge::python-igraph conda-forge::leidenalg" From d0ad7f6a90056903f2b7ee473232dcfc202523ff Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 13 Sep 2024 12:59:19 +0200 Subject: [PATCH 19/66] module is only to convert to rds --- modules/local/anndatar_convert.nf | 4 +--- modules/local/templates/anndatar_convert.R | 5 ----- subworkflows/local/mtx_conversion.nf | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/modules/local/anndatar_convert.nf b/modules/local/anndatar_convert.nf index 3f985b64..dfe5fce9 100644 --- a/modules/local/anndatar_convert.nf +++ b/modules/local/anndatar_convert.nf @@ -3,13 +3,12 @@ process ANNDATAR_CONVERT { label 'process_medium' - container "fmalmeida/anndatar:dev" + container "fmalmeida/anndatar:dev" // TODO: Fix input: tuple val(meta), path(h5ad) output: - tuple val(meta), path("${meta.id}_standardized.h5ad"), emit: h5ad tuple val(meta), path("${meta.id}_standardized.Rds"), emit: rds when: @@ -20,7 +19,6 @@ process ANNDATAR_CONVERT { stub: """ - touch ${meta.id}_standardized.h5ad touch ${meta.id}_standardized.Rds """ } diff --git a/modules/local/templates/anndatar_convert.R b/modules/local/templates/anndatar_convert.R index 016f8bb4..479ac912 100755 --- a/modules/local/templates/anndatar_convert.R +++ b/modules/local/templates/anndatar_convert.R @@ -8,13 +8,8 @@ library(anndataR) # read input adata <- read_h5ad("${h5ad}") -# -# scope to perform standardization options -# - # convert to Rds obj <- adata\$to_Seurat() # save files -write_h5ad(adata, "${meta.id}_standardized.h5ad") saveRDS(obj, file = "${meta.id}_standardized.Rds") diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 0178868f..4e85a1a7 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -24,7 +24,7 @@ workflow MTX_CONVERSION { // // Concat sample-specific h5ad in one // - ch_concat_h5ad_input = ANNDATAR_CONVERT.out.h5ad.groupTuple() // gather all sample-specific files / per type + ch_concat_h5ad_input = mtx_matrices.groupTuple() // gather all sample-specific files / per type if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') { // when having spliced / unspliced matrices, the collected tuple has two levels ( [[mtx_1, mtx_2]] ) // which nextflow break because it is not a valid 'path' thus, we have to remove one level From 20ac4ae8a5eea676428c28945a684b55444cd8f9 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 13 Sep 2024 13:37:19 +0200 Subject: [PATCH 20/66] update directives --- conf/modules.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 81395a1d..1f83f2a6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -45,12 +45,13 @@ process { } } - withName: 'MTX_TO_H5AD|CONCAT_H5AD|MTX_TO_SEURAT' { + withName: 'MTX_TO_H5AD*|CONCAT_H5AD|ANNDATAR_CONVERT' { publishDir = [ path: { "${params.outdir}/${params.aligner}/mtx_conversions" }, mode: params.publish_dir_mode ] } + withName: 'GTF_GENE_FILTER' { publishDir = [ path: { "${params.outdir}/gtf_filter" }, From da5b0363dcfd972b317056cd9eae1fd431926658 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 13 Sep 2024 13:39:05 +0200 Subject: [PATCH 21/66] update comments --- subworkflows/local/mtx_conversion.nf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 4e85a1a7..a2ff58c7 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -1,8 +1,6 @@ /* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ -include { ANNDATAR_CONVERT } from '../../modules/local/anndatar_convert' -include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad.nf' include { CONCAT_H5AD } from '../../modules/local/concat_h5ad.nf' -include { MTX_TO_SEURAT } from '../../modules/local/mtx_to_seurat.nf' +include { ANNDATAR_CONVERT } from '../../modules/local/anndatar_convert' include { EMPTY_DROPLET_REMOVAL } from './emptydrops_removal' workflow MTX_CONVERSION { @@ -15,7 +13,7 @@ workflow MTX_CONVERSION { ch_versions = Channel.empty() // - // MODULE: Standardize h5ad and convert with AnndataR package + // MODULE: Convert to Rds with AnndataR package // ANNDATAR_CONVERT ( mtx_matrices From b90f3884770bddcea033d9e59829c9179d7ff391 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 13 Sep 2024 14:28:05 +0200 Subject: [PATCH 22/66] add cellbender to workflow --- subworkflows/local/mtx_conversion.nf | 1 - workflows/scrnaseq.nf | 31 +++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index a2ff58c7..6435966d 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -1,7 +1,6 @@ /* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ include { CONCAT_H5AD } from '../../modules/local/concat_h5ad.nf' include { ANNDATAR_CONVERT } from '../../modules/local/anndatar_convert' -include { EMPTY_DROPLET_REMOVAL } from './emptydrops_removal' workflow MTX_CONVERSION { diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index bc5eaef6..1e7ea9ef 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -3,11 +3,12 @@ include { FASTQC_CHECK } from '../subworkflows/local/fastq include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kallisto_bustools' include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin' include { STARSOLO } from '../subworkflows/local/starsolo' -include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" -include { CELLRANGER_MULTI_ALIGN } from "../subworkflows/local/align_cellrangermulti" -include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align_cellrangerarc" -include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc" -include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion" +include { CELLRANGER_ALIGN } from '../subworkflows/local/align_cellranger' +include { CELLRANGER_MULTI_ALIGN } from '../subworkflows/local/align_cellrangermulti' +include { CELLRANGERARC_ALIGN } from '../subworkflows/local/align_cellrangerarc' +include { UNIVERSC_ALIGN } from '../subworkflows/local/align_universc' +include { EMPTY_DROPLET_REMOVAL } from '../subworkflows/local/emptydrops_removal' +include { MTX_CONVERSION } from '../subworkflows/local/mtx_conversion' include { GTF_GENE_FILTER } from '../modules/local/gtf_gene_filter' include { GUNZIP as GUNZIP_FASTA } from '../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_GTF } from '../modules/nf-core/gunzip/main' @@ -286,6 +287,26 @@ workflow SCRNASEQ { } + // SUBWORKFLOW: Run cellbender emptydrops filter + if ( !params.skip_emptydrops && !(params.aligner in ['cellrangerarc']) ) { + + // + // emptydrops should only run on the raw matrices thus, filter-out the filtered result of the aligners that can produce it + // + if ( params.aligner in [ 'cellranger', 'cellrangermulti', 'kallisto', 'star' ] ) { + ch_h5ad_matrices_for_emptydrops = + ch_h5ad_matrices.filter { meta, mtx_files -> meta.input_type == 'raw' } + } else { + ch_h5ad_matrices_for_emptydrops = ch_h5ad_matrices + } + + EMPTY_DROPLET_REMOVAL ( + ch_h5ad_matrices_for_emptydrops + ) + // ch_h5ad_matrices = ch_h5ad_matrices.mix( EMPTYDROPS_CELL_CALLING.out.filtered_matrices ) + + } + // Run mtx to h5ad conversion subworkflow MTX_CONVERSION ( ch_h5ad_matrices, From 7c304cc2c259709f5fd09921cafb630280fd9661 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 13 Sep 2024 14:41:58 +0200 Subject: [PATCH 23/66] start organisation of files --- conf/modules.config | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 1f83f2a6..8a53c285 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -32,15 +32,17 @@ process { } if (!params.skip_emptydrops) { - withName: EMPTYDROPS_CELL_CALLING { + withName: 'CELLBENDER_REMOVEBACKGROUND' { publishDir = [ - path: { "${params.outdir}/${params.aligner}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> - if ( params.aligner == 'cellranger' ) "count/${meta.id}/${filename}" - else if ( params.aligner == 'kallisto' ) "${meta.id}.count/${filename}" - else "${meta.id}/${filename}" - } + path: { "${params.outdir}/${params.aligner}/${meta.id}/emptydrops_filter" }, + mode: params.publish_dir_mode + ] + } + withName: 'ADATA_BARCODES' { + ext.prefix = { "${meta.id}_custom_emptydrops_filter_matrix" } + publishDir = [ + path: { "${params.outdir}/${params.aligner}/mtx_conversions/${meta.id}" }, + mode: params.publish_dir_mode ] } } From 83204317227513880fc330429658e256761b93df Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Thu, 26 Sep 2024 09:25:25 +0200 Subject: [PATCH 24/66] fix file naming --- modules/local/anndatar_convert.nf | 4 ++-- modules/local/templates/anndatar_convert.R | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/local/anndatar_convert.nf b/modules/local/anndatar_convert.nf index dfe5fce9..a3a67a08 100644 --- a/modules/local/anndatar_convert.nf +++ b/modules/local/anndatar_convert.nf @@ -9,7 +9,7 @@ process ANNDATAR_CONVERT { tuple val(meta), path(h5ad) output: - tuple val(meta), path("${meta.id}_standardized.Rds"), emit: rds + tuple val(meta), path("${meta.id}/${meta.id}_${meta.input_type}_matrix.Rds"), emit: rds when: task.ext.when == null || task.ext.when @@ -19,6 +19,6 @@ process ANNDATAR_CONVERT { stub: """ - touch ${meta.id}_standardized.Rds + touch ${meta.id}.Rds """ } diff --git a/modules/local/templates/anndatar_convert.R b/modules/local/templates/anndatar_convert.R index 479ac912..06a723d7 100755 --- a/modules/local/templates/anndatar_convert.R +++ b/modules/local/templates/anndatar_convert.R @@ -12,4 +12,5 @@ adata <- read_h5ad("${h5ad}") obj <- adata\$to_Seurat() # save files -saveRDS(obj, file = "${meta.id}_standardized.Rds") +dir.create(file.path("$meta.id"), showWarnings = FALSE) +saveRDS(obj, file = "${meta.id}/${meta.id}_${meta.input_type}_matrix.Rds") From 1c95e858d822dde703b4e25106dd3cae1226fb89 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Thu, 26 Sep 2024 09:36:57 +0200 Subject: [PATCH 25/66] resolve emptydrops naming --- conf/modules.config | 2 +- subworkflows/local/emptydrops_removal.nf | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 8a53c285..ac26f731 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -39,7 +39,7 @@ process { ] } withName: 'ADATA_BARCODES' { - ext.prefix = { "${meta.id}_custom_emptydrops_filter_matrix" } + ext.prefix = { "${meta.id}_${meta.input_type}_matrix" } publishDir = [ path: { "${params.outdir}/${params.aligner}/mtx_conversions/${meta.id}" }, mode: params.publish_dir_mode diff --git a/subworkflows/local/emptydrops_removal.nf b/subworkflows/local/emptydrops_removal.nf index 202612d6..7171c3f3 100644 --- a/subworkflows/local/emptydrops_removal.nf +++ b/subworkflows/local/emptydrops_removal.nf @@ -11,7 +11,15 @@ workflow EMPTY_DROPLET_REMOVAL { CELLBENDER_REMOVEBACKGROUND(ch_unfiltered) ch_versions = ch_versions.mix(CELLBENDER_REMOVEBACKGROUND.out.versions) - ch_combined = ch_unfiltered.join(CELLBENDER_REMOVEBACKGROUND.out.barcodes) + ch_combined = + ch_unfiltered + .join(CELLBENDER_REMOVEBACKGROUND.out.barcodes) + .map { meta, h5ad, csv -> + def meta_clone = meta.clone() + meta_clone.input_type = 'emptydrops_filter' + + [ meta_clone, h5ad, csv ] + } ADATA_BARCODES(ch_combined) ch_versions = ch_versions.mix(ADATA_BARCODES.out.versions) From e6fff346698a321e8305d0ae6f78e58adcfd4b41 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Thu, 26 Sep 2024 09:38:17 +0200 Subject: [PATCH 26/66] also convert emptydrops filter matrices --- workflows/scrnaseq.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 1e7ea9ef..a0e7c958 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -303,7 +303,7 @@ workflow SCRNASEQ { EMPTY_DROPLET_REMOVAL ( ch_h5ad_matrices_for_emptydrops ) - // ch_h5ad_matrices = ch_h5ad_matrices.mix( EMPTYDROPS_CELL_CALLING.out.filtered_matrices ) + ch_h5ad_matrices = ch_h5ad_matrices.mix( EMPTY_DROPLET_REMOVAL.out.h5ad ) } From e9c88e89891a9858bed3fea721432c56d6ea6301 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Thu, 26 Sep 2024 09:40:03 +0200 Subject: [PATCH 27/66] move files to BKP since they will be replaced --- modules/local/{ => BKP}/emptydrops.nf | 0 modules/local/{ => BKP}/mtx_to_h5ad.nf | 0 modules/local/{ => BKP}/mtx_to_seurat.nf | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename modules/local/{ => BKP}/emptydrops.nf (100%) rename modules/local/{ => BKP}/mtx_to_h5ad.nf (100%) rename modules/local/{ => BKP}/mtx_to_seurat.nf (100%) diff --git a/modules/local/emptydrops.nf b/modules/local/BKP/emptydrops.nf similarity index 100% rename from modules/local/emptydrops.nf rename to modules/local/BKP/emptydrops.nf diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/BKP/mtx_to_h5ad.nf similarity index 100% rename from modules/local/mtx_to_h5ad.nf rename to modules/local/BKP/mtx_to_h5ad.nf diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/BKP/mtx_to_seurat.nf similarity index 100% rename from modules/local/mtx_to_seurat.nf rename to modules/local/BKP/mtx_to_seurat.nf From f7f5fa520bb016f99de2bf8e0872aff7bf2eb902 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Thu, 26 Sep 2024 12:16:11 +0200 Subject: [PATCH 28/66] add conversion for alevin --- modules/local/{ => alevin}/alevinqc.nf | 0 modules/local/alevin/mtx_to_h5ad.nf | 34 +++++++ modules/local/{ => alevin}/simpleaf_index.nf | 0 modules/local/{ => alevin}/simpleaf_quant.nf | 0 modules/local/alevin/templates/mtx_to_h5ad.py | 96 +++++++++++++++++++ subworkflows/local/alevin.nf | 25 +++-- workflows/scrnaseq.nf | 2 +- 7 files changed, 148 insertions(+), 9 deletions(-) rename modules/local/{ => alevin}/alevinqc.nf (100%) create mode 100644 modules/local/alevin/mtx_to_h5ad.nf rename modules/local/{ => alevin}/simpleaf_index.nf (100%) rename modules/local/{ => alevin}/simpleaf_quant.nf (100%) create mode 100755 modules/local/alevin/templates/mtx_to_h5ad.py diff --git a/modules/local/alevinqc.nf b/modules/local/alevin/alevinqc.nf similarity index 100% rename from modules/local/alevinqc.nf rename to modules/local/alevin/alevinqc.nf diff --git a/modules/local/alevin/mtx_to_h5ad.nf b/modules/local/alevin/mtx_to_h5ad.nf new file mode 100644 index 00000000..0c4fc844 --- /dev/null +++ b/modules/local/alevin/mtx_to_h5ad.nf @@ -0,0 +1,34 @@ +process MTX_TO_H5AD { + tag "$meta.id" + label 'process_medium' + + conda "conda-forge::scanpy==1.10.2 conda-forge::python-igraph conda-forge::leidenalg" + container "community.wave.seqera.io/library/scanpy:1.10.2--e83da2205b92a538" + + input: + tuple val(meta), path(inputs) + path star_index + + output: + tuple val(meta2), path("${meta.id}/*h5ad"), emit: h5ad + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Get a file to check input type. Some aligners bring arrays instead of a single file. + def input_to_check = (inputs instanceof String) ? inputs : inputs[0] + + // alevin has a single output + meta2 = meta + [input_type: 'raw'] + + template 'mtx_to_h5ad.py' + + stub: + """ + mkdir ${meta.id} + touch ${meta.id}/${meta.id}_matrix.h5ad + touch versions.yml + """ +} diff --git a/modules/local/simpleaf_index.nf b/modules/local/alevin/simpleaf_index.nf similarity index 100% rename from modules/local/simpleaf_index.nf rename to modules/local/alevin/simpleaf_index.nf diff --git a/modules/local/simpleaf_quant.nf b/modules/local/alevin/simpleaf_quant.nf similarity index 100% rename from modules/local/simpleaf_quant.nf rename to modules/local/alevin/simpleaf_quant.nf diff --git a/modules/local/alevin/templates/mtx_to_h5ad.py b/modules/local/alevin/templates/mtx_to_h5ad.py new file mode 100755 index 00000000..fb784fde --- /dev/null +++ b/modules/local/alevin/templates/mtx_to_h5ad.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +# Set numba chache dir to current working directory (which is a writable mount also in containers) +import os + +os.environ["NUMBA_CACHE_DIR"] = "." + +import scanpy as sc +import pandas as pd +import argparse +from anndata import AnnData +import platform + +def _mtx_to_adata( + input: str, + sample: str, +): + + adata = sc.read_mtx(f"{input}/quants_mat.mtx") + adata.obs_names = pd.read_csv(f"{input}/quants_mat_rows.txt", header=None, sep="\t")[0].values + adata.var_names = pd.read_csv(f"{input}/quants_mat_cols.txt", header=None, sep="\t")[0].values + adata.obs["sample"] = sample + + return adata + + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +def dump_versions(): + versions = { + "${task.process}": { + "python": platform.python_version(), + "scanpy": sc.__version__, + "pandas": pd.__version__ + } + } + + with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) + +def input_to_adata( + input_data: str, + output: str, + sample: str, +): + print(f"Reading in {input_data}") + + # open main data + adata = _mtx_to_adata(input_data, sample) + + # standard format + # index are gene IDs and symbols are a column + # TODO: how to get gene_symbols for alevin? + adata.var['gene_versions'] = adata.var.index + adata.var['gene_ids'] = adata.var['gene_versions'].str.split('.').str[0] + adata.var.index = adata.var["gene_ids"].values + adata.var = adata.var.drop("gene_ids", axis=1) + adata.var_names_make_unique() + + # write results + adata.write_h5ad(f"{output}", compression="gzip") + print(f"Wrote h5ad file to {output}") + + # dump versions + dump_versions() + + return adata + +# +# Run main script +# + +# create the directory with the sample name +os.makedirs("${meta2.id}", exist_ok=True) + +# input_type comes from NF module +adata = input_to_adata( + input_data="${meta2.id}_alevin_results/af_quant/alevin/", + output="${meta2.id}/${meta2.id}_${meta2.input_type}_matrix.h5ad", + sample="${meta2.id}" +) diff --git a/subworkflows/local/alevin.nf b/subworkflows/local/alevin.nf index 764c08f8..d8848dc4 100644 --- a/subworkflows/local/alevin.nf +++ b/subworkflows/local/alevin.nf @@ -1,11 +1,12 @@ /* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ -include { GFFREAD_TRANSCRIPTOME } from '../../modules/local/gffread_transcriptome' -include { ALEVINQC } from '../../modules/local/alevinqc' -include { SIMPLEAF_INDEX } from '../../modules/local/simpleaf_index' -include { SIMPLEAF_QUANT } from '../../modules/local/simpleaf_quant' +include { GFFREAD_TRANSCRIPTOME } from '../../modules/local/gffread_transcriptome' +include { ALEVINQC } from '../../modules/local/alevin/alevinqc' +include { SIMPLEAF_INDEX } from '../../modules/local/alevin/simpleaf_index' +include { SIMPLEAF_QUANT } from '../../modules/local/alevin/simpleaf_quant' +include { MTX_TO_H5AD } from '../../modules/local/alevin/mtx_to_h5ad' /* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */ -include { GUNZIP } from '../../modules/nf-core/gunzip/main' +include { GUNZIP } from '../../modules/nf-core/gunzip/main' include { GFFREAD as GFFREAD_TXP2GENE } from '../../modules/nf-core/gffread/main' def multiqc_report = [] @@ -44,8 +45,6 @@ workflow SCRNASEQ_ALEVIN { } } - - /* * Perform quantification with salmon alevin */ @@ -58,6 +57,15 @@ workflow SCRNASEQ_ALEVIN { ) ch_versions = ch_versions.mix(SIMPLEAF_QUANT.out.versions) + /* + * Perform h5ad conversion + */ + MTX_TO_H5AD ( + SIMPLEAF_QUANT.out.alevin_results, + [] + ) + ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions.first()) + /* * Run alevinQC */ @@ -67,5 +75,6 @@ workflow SCRNASEQ_ALEVIN { emit: ch_versions alevin_results = SIMPLEAF_QUANT.out.alevin_results - alevinqc = ALEVINQC.out.report + alevin_h5ad = MTX_TO_H5AD.out.h5ad + alevinqc = ALEVINQC.out.report } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index a0e7c958..c4a63d7d 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -155,7 +155,7 @@ workflow SCRNASEQ { ) ch_versions = ch_versions.mix(SCRNASEQ_ALEVIN.out.ch_versions) ch_multiqc_files = ch_multiqc_files.mix(SCRNASEQ_ALEVIN.out.alevin_results.map{ meta, it -> it }) - ch_h5ad_matrices = ch_h5ad_matrices.mix(SCRNASEQ_ALEVIN.out.alevin_results) + ch_h5ad_matrices = ch_h5ad_matrices.mix(SCRNASEQ_ALEVIN.out.alevin_h5ad) } // Run STARSolo pipeline From 73c91c27e0e6798f46d44ce05fdf5155821b68a8 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Thu, 26 Sep 2024 12:28:43 +0200 Subject: [PATCH 29/66] re-organise star modules --- .../local/{mtx_to_h5ad_star.nf => star/mtx_to_h5ad.nf} | 8 ++++---- modules/local/{ => star}/star_align.nf | 0 .../mtx_to_h5ad_star.py => star/templates/mtx_to_h5ad.py} | 8 ++++---- subworkflows/local/starsolo.nf | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) rename modules/local/{mtx_to_h5ad_star.nf => star/mtx_to_h5ad.nf} (80%) rename modules/local/{ => star}/star_align.nf (100%) rename modules/local/{templates/mtx_to_h5ad_star.py => star/templates/mtx_to_h5ad.py} (92%) diff --git a/modules/local/mtx_to_h5ad_star.nf b/modules/local/star/mtx_to_h5ad.nf similarity index 80% rename from modules/local/mtx_to_h5ad_star.nf rename to modules/local/star/mtx_to_h5ad.nf index 84474ed0..8e77749d 100644 --- a/modules/local/mtx_to_h5ad_star.nf +++ b/modules/local/star/mtx_to_h5ad.nf @@ -1,4 +1,4 @@ -process MTX_TO_H5AD_STAR { +process MTX_TO_H5AD { tag "$meta.id" label 'process_medium' @@ -21,10 +21,10 @@ process MTX_TO_H5AD_STAR { def input_to_check = (inputs instanceof String) ? inputs : inputs[0] // check input type of inputs - input_type = (input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered' - meta2 = meta + [input_type: input_type] + input_type = (input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered' + meta2 = meta + [input_type: input_type] - template 'mtx_to_h5ad_star.py' + template 'mtx_to_h5ad.py' stub: """ diff --git a/modules/local/star_align.nf b/modules/local/star/star_align.nf similarity index 100% rename from modules/local/star_align.nf rename to modules/local/star/star_align.nf diff --git a/modules/local/templates/mtx_to_h5ad_star.py b/modules/local/star/templates/mtx_to_h5ad.py similarity index 92% rename from modules/local/templates/mtx_to_h5ad_star.py rename to modules/local/star/templates/mtx_to_h5ad.py index 48749114..3a84295d 100755 --- a/modules/local/templates/mtx_to_h5ad_star.py +++ b/modules/local/star/templates/mtx_to_h5ad.py @@ -82,11 +82,11 @@ def input_to_adata( # # create the directory with the sample name -os.makedirs("${meta.id}", exist_ok=True) +os.makedirs("${meta2.id}", exist_ok=True) # input_type comes from NF module adata = input_to_adata( - input_data="${input_type}", - output="${meta.id}/${meta.id}_${input_type}_matrix.h5ad", - sample="${meta.id}" + input_data="${meta2.input_type}", + output="${meta2.id}/${meta2.id}_${meta2.input_type}_matrix.h5ad", + sample="${meta2.id}" ) diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf index 8236632d..3499f707 100644 --- a/subworkflows/local/starsolo.nf +++ b/subworkflows/local/starsolo.nf @@ -1,6 +1,6 @@ /* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ -include { STAR_ALIGN } from '../../modules/local/star_align' -include { MTX_TO_H5AD_STAR as MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad_star' +include { STAR_ALIGN } from '../../modules/local/star/star_align' +include { MTX_TO_H5AD } from '../../modules/local/star/mtx_to_h5ad' /* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */ include { GUNZIP } from '../../modules/nf-core/gunzip/main' From 417bf698f44419f7ac23a806608738f8ba822de9 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 27 Sep 2024 09:52:37 +0200 Subject: [PATCH 30/66] fixed publishDir directives --- conf/modules.config | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ac26f731..a6fc3411 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,22 +35,25 @@ process { withName: 'CELLBENDER_REMOVEBACKGROUND' { publishDir = [ path: { "${params.outdir}/${params.aligner}/${meta.id}/emptydrops_filter" }, - mode: params.publish_dir_mode + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } withName: 'ADATA_BARCODES' { ext.prefix = { "${meta.id}_${meta.input_type}_matrix" } publishDir = [ path: { "${params.outdir}/${params.aligner}/mtx_conversions/${meta.id}" }, - mode: params.publish_dir_mode + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } } - withName: 'MTX_TO_H5AD*|CONCAT_H5AD|ANNDATAR_CONVERT' { + withName: 'MTX_TO_H5AD|CONCAT_H5AD|ANNDATAR_CONVERT' { publishDir = [ path: { "${params.outdir}/${params.aligner}/mtx_conversions" }, - mode: params.publish_dir_mode + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } @@ -164,8 +167,9 @@ if (params.aligner == "alevin") { } withName: 'SIMPLEAF_QUANT' { publishDir = [ - path: { "${params.outdir}/${params.aligner}" }, - mode: params.publish_dir_mode + path: { "${params.outdir}/${params.aligner}/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.args = "-r cr-like" } From 94bf9c11ed6909d6615f059e3aebcc2ac12d4566 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Mon, 30 Sep 2024 13:47:53 +0200 Subject: [PATCH 31/66] add h5ad conversion module --- modules/local/mtx_to_h5ad.nf | 35 ++++++ .../local/templates/mtx_to_h5ad_cellranger.py | 101 ++++++++++++++++++ subworkflows/local/align_cellranger.nf | 21 +++- 3 files changed, 152 insertions(+), 5 deletions(-) create mode 100644 modules/local/mtx_to_h5ad.nf create mode 100755 modules/local/templates/mtx_to_h5ad_cellranger.py diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf new file mode 100644 index 00000000..cb3a0a4f --- /dev/null +++ b/modules/local/mtx_to_h5ad.nf @@ -0,0 +1,35 @@ +process MTX_TO_H5AD { + tag "$meta.id" + label 'process_medium' + + conda "conda-forge::scanpy conda-forge::python-igraph conda-forge::leidenalg" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/scanpy:1.7.2--pyhdfd78af_0' : + 'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }" + + input: + // inputs from cellranger nf-core module does not come in a single sample dir + // for each sample, the sub-folders and files come directly in array. + tuple val(meta), path(inputs) + path txp2gene + path star_index + + output: + tuple val(input_type), path("${meta.id}/*h5ad") , emit: h5ad + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def aligner = (params.aligner in [ 'cellranger', 'cellrangerarc', 'cellrangermulti' ]) ? 'cellranger' : params.aligner + + template "mtx_to_h5ad_${aligner}.py" + + stub: + """ + mkdir ${meta.id} + touch ${meta.id}/${meta.id}_matrix.h5ad + touch versions.yml + """ +} diff --git a/modules/local/templates/mtx_to_h5ad_cellranger.py b/modules/local/templates/mtx_to_h5ad_cellranger.py new file mode 100755 index 00000000..a91fe0cf --- /dev/null +++ b/modules/local/templates/mtx_to_h5ad_cellranger.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python + +# Set numba chache dir to current working directory (which is a writable mount also in containers) +import os + +os.environ["NUMBA_CACHE_DIR"] = "." + +import scanpy as sc +import pandas as pd +import argparse +from anndata import AnnData +import platform + +def _mtx_to_adata( + input: str, + sample: str, +): + + adata = sc.read_10x_h5(input) + adata.var["gene_symbols"] = adata.var_names + adata.var.set_index("gene_ids", inplace=True) + adata.obs["sample"] = sample + + # reorder columns for 10x mtx files + adata.var = adata.var[["gene_symbols", "feature_types", "genome"]] + + return adata + + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +def dump_versions(): + versions = { + "${task.process}": { + "python": platform.python_version(), + "scanpy": sc.__version__, + "pandas": pd.__version__ + } + } + + with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) + +def input_to_adata( + input_data: str, + output: str, + sample: str, +): + print(f"Reading in {input_data}") + + # open main data + adata = _mtx_to_adata(input_data, sample) + + # standard format + # index are gene IDs and symbols are a column + adata.var['gene_versions'] = adata.var.index + adata.var['gene_ids'] = adata.var['gene_versions'].str.split('.').str[0] + adata.var.index = adata.var["gene_ids"].values + adata.var = adata.var.drop("gene_ids", axis=1) + adata.var_names_make_unique() + + print(adata) + print(adata.var) + + # write results + adata.write_h5ad(f"{output}", compression="gzip") + print(f"Wrote h5ad file to {output}") + + # dump versions + dump_versions() + + return adata + +# +# Run main script +# + +# create the directory with the sample name +os.makedirs("${meta.id}", exist_ok=True) + +# input_type comes from NF module +adata = input_to_adata( + input_data="${meta.input_type}_feature_bc_matrix.h5", + output="${meta.id}/${meta.id}_${meta.input_type}_matrix.h5ad", + sample="${meta.id}" +) diff --git a/subworkflows/local/align_cellranger.nf b/subworkflows/local/align_cellranger.nf index 2461373b..413efc02 100644 --- a/subworkflows/local/align_cellranger.nf +++ b/subworkflows/local/align_cellranger.nf @@ -2,9 +2,10 @@ * Alignment with Cellranger */ -include {CELLRANGER_MKGTF} from "../../modules/nf-core/cellranger/mkgtf/main.nf" -include {CELLRANGER_MKREF} from "../../modules/nf-core/cellranger/mkref/main.nf" -include {CELLRANGER_COUNT} from "../../modules/nf-core/cellranger/count/main.nf" +include { CELLRANGER_MKGTF } from "../../modules/nf-core/cellranger/mkgtf/main.nf" +include { CELLRANGER_MKREF } from "../../modules/nf-core/cellranger/mkref/main.nf" +include { CELLRANGER_COUNT } from "../../modules/nf-core/cellranger/count/main.nf" +include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad' // Define workflow to subset and index a genome region fasta file workflow CELLRANGER_ALIGN { @@ -49,7 +50,7 @@ workflow CELLRANGER_ALIGN { mtx_files.each{ if ( it.toString().contains("raw_feature_bc_matrix") ) { desired_files.add( it ) } } - [ meta, desired_files ] + [ meta + [input_type: 'raw'], desired_files ] } ch_matrices_filtered = @@ -58,9 +59,19 @@ workflow CELLRANGER_ALIGN { mtx_files.each{ if ( it.toString().contains("filtered_feature_bc_matrix") ) { desired_files.add( it ) } } - [ meta, desired_files ] + [ meta + [input_type: 'filtered'], desired_files ] } + /* + * Perform h5ad conversion + */ + MTX_TO_H5AD ( + ch_matrices_raw.mix( ch_matrices_filtered ), + [], + [] + ) + ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions.first()) + emit: ch_versions cellranger_out = CELLRANGER_COUNT.out.outs From 43a29c22e632dcdbfbe452debd836b750efcb54b Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Mon, 30 Sep 2024 14:32:39 +0200 Subject: [PATCH 32/66] integrate to mtx_conversion module currently receiving the error: 'H5AD encoding information is missing. This file may have been created with Python anndata<0.8.0.' --- modules/local/anndatar_convert.nf | 2 +- modules/local/mtx_to_h5ad.nf | 4 ++-- modules/local/templates/mtx_to_h5ad_cellranger.py | 3 --- subworkflows/local/align_cellranger.nf | 1 + workflows/scrnaseq.nf | 2 +- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/modules/local/anndatar_convert.nf b/modules/local/anndatar_convert.nf index a3a67a08..d6a8271d 100644 --- a/modules/local/anndatar_convert.nf +++ b/modules/local/anndatar_convert.nf @@ -3,7 +3,7 @@ process ANNDATAR_CONVERT { label 'process_medium' - container "fmalmeida/anndatar:dev" // TODO: Fix + container "docker://fmalmeida/anndatar:dev" // TODO: Fix input: tuple val(meta), path(h5ad) diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index cb3a0a4f..f708162d 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -15,8 +15,8 @@ process MTX_TO_H5AD { path star_index output: - tuple val(input_type), path("${meta.id}/*h5ad") , emit: h5ad - path "versions.yml" , emit: versions + tuple val(meta), path("${meta.id}/*h5ad"), emit: h5ad + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/templates/mtx_to_h5ad_cellranger.py b/modules/local/templates/mtx_to_h5ad_cellranger.py index a91fe0cf..294a15e7 100755 --- a/modules/local/templates/mtx_to_h5ad_cellranger.py +++ b/modules/local/templates/mtx_to_h5ad_cellranger.py @@ -74,9 +74,6 @@ def input_to_adata( adata.var = adata.var.drop("gene_ids", axis=1) adata.var_names_make_unique() - print(adata) - print(adata.var) - # write results adata.write_h5ad(f"{output}", compression="gzip") print(f"Wrote h5ad file to {output}") diff --git a/subworkflows/local/align_cellranger.nf b/subworkflows/local/align_cellranger.nf index 413efc02..09442313 100644 --- a/subworkflows/local/align_cellranger.nf +++ b/subworkflows/local/align_cellranger.nf @@ -76,5 +76,6 @@ workflow CELLRANGER_ALIGN { ch_versions cellranger_out = CELLRANGER_COUNT.out.outs cellranger_matrices = ch_matrices_raw.mix( ch_matrices_filtered ) + cellranger_h5ad = MTX_TO_H5AD.out.h5ad star_index = cellranger_index } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index c4a63d7d..4b6323ee 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -185,7 +185,7 @@ workflow SCRNASEQ { protocol_config['protocol'] ) ch_versions = ch_versions.mix(CELLRANGER_ALIGN.out.ch_versions) - ch_star_index = CELLRANGER_ALIGN.out.star_index + ch_h5ad_matrices = ch_h5ad_matrices.mix(CELLRANGER_ALIGN.out.cellranger_h5ad) ch_multiqc_files = ch_multiqc_files.mix(CELLRANGER_ALIGN.out.cellranger_out.map{ meta, outs -> outs.findAll{ it -> it.name == "web_summary.html"} }) From 7fc32f0a7bd6b701739156bd16c15fa6394a37ab Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Mon, 30 Sep 2024 14:48:11 +0200 Subject: [PATCH 33/66] reorganize module levels and fix docker image --- modules/local/alevin/mtx_to_h5ad.nf | 34 ------------------- modules/local/{alevin => }/alevinqc.nf | 0 modules/local/mtx_to_h5ad.nf | 6 ++-- modules/local/{alevin => }/simpleaf_index.nf | 0 modules/local/{alevin => }/simpleaf_quant.nf | 0 .../mtx_to_h5ad_alevin.py} | 8 ++--- subworkflows/local/alevin.nf | 11 +++--- 7 files changed, 12 insertions(+), 47 deletions(-) delete mode 100644 modules/local/alevin/mtx_to_h5ad.nf rename modules/local/{alevin => }/alevinqc.nf (100%) rename modules/local/{alevin => }/simpleaf_index.nf (100%) rename modules/local/{alevin => }/simpleaf_quant.nf (100%) rename modules/local/{alevin/templates/mtx_to_h5ad.py => templates/mtx_to_h5ad_alevin.py} (92%) diff --git a/modules/local/alevin/mtx_to_h5ad.nf b/modules/local/alevin/mtx_to_h5ad.nf deleted file mode 100644 index 0c4fc844..00000000 --- a/modules/local/alevin/mtx_to_h5ad.nf +++ /dev/null @@ -1,34 +0,0 @@ -process MTX_TO_H5AD { - tag "$meta.id" - label 'process_medium' - - conda "conda-forge::scanpy==1.10.2 conda-forge::python-igraph conda-forge::leidenalg" - container "community.wave.seqera.io/library/scanpy:1.10.2--e83da2205b92a538" - - input: - tuple val(meta), path(inputs) - path star_index - - output: - tuple val(meta2), path("${meta.id}/*h5ad"), emit: h5ad - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - // Get a file to check input type. Some aligners bring arrays instead of a single file. - def input_to_check = (inputs instanceof String) ? inputs : inputs[0] - - // alevin has a single output - meta2 = meta + [input_type: 'raw'] - - template 'mtx_to_h5ad.py' - - stub: - """ - mkdir ${meta.id} - touch ${meta.id}/${meta.id}_matrix.h5ad - touch versions.yml - """ -} diff --git a/modules/local/alevin/alevinqc.nf b/modules/local/alevinqc.nf similarity index 100% rename from modules/local/alevin/alevinqc.nf rename to modules/local/alevinqc.nf diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index f708162d..751cf9a5 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -2,10 +2,8 @@ process MTX_TO_H5AD { tag "$meta.id" label 'process_medium' - conda "conda-forge::scanpy conda-forge::python-igraph conda-forge::leidenalg" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/scanpy:1.7.2--pyhdfd78af_0' : - 'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }" + conda "conda-forge::scanpy==1.10.2 conda-forge::python-igraph conda-forge::leidenalg" + container "community.wave.seqera.io/library/scanpy:1.10.2--e83da2205b92a538" input: // inputs from cellranger nf-core module does not come in a single sample dir diff --git a/modules/local/alevin/simpleaf_index.nf b/modules/local/simpleaf_index.nf similarity index 100% rename from modules/local/alevin/simpleaf_index.nf rename to modules/local/simpleaf_index.nf diff --git a/modules/local/alevin/simpleaf_quant.nf b/modules/local/simpleaf_quant.nf similarity index 100% rename from modules/local/alevin/simpleaf_quant.nf rename to modules/local/simpleaf_quant.nf diff --git a/modules/local/alevin/templates/mtx_to_h5ad.py b/modules/local/templates/mtx_to_h5ad_alevin.py similarity index 92% rename from modules/local/alevin/templates/mtx_to_h5ad.py rename to modules/local/templates/mtx_to_h5ad_alevin.py index fb784fde..fba5a34f 100755 --- a/modules/local/alevin/templates/mtx_to_h5ad.py +++ b/modules/local/templates/mtx_to_h5ad_alevin.py @@ -86,11 +86,11 @@ def input_to_adata( # # create the directory with the sample name -os.makedirs("${meta2.id}", exist_ok=True) +os.makedirs("${meta.id}", exist_ok=True) # input_type comes from NF module adata = input_to_adata( - input_data="${meta2.id}_alevin_results/af_quant/alevin/", - output="${meta2.id}/${meta2.id}_${meta2.input_type}_matrix.h5ad", - sample="${meta2.id}" + input_data="${meta.id}_alevin_results/af_quant/alevin/", + output="${meta.id}/${meta.id}_${meta.input_type}_matrix.h5ad", + sample="${meta.id}" ) diff --git a/subworkflows/local/alevin.nf b/subworkflows/local/alevin.nf index d8848dc4..0aa2483f 100644 --- a/subworkflows/local/alevin.nf +++ b/subworkflows/local/alevin.nf @@ -1,9 +1,9 @@ /* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ include { GFFREAD_TRANSCRIPTOME } from '../../modules/local/gffread_transcriptome' -include { ALEVINQC } from '../../modules/local/alevin/alevinqc' -include { SIMPLEAF_INDEX } from '../../modules/local/alevin/simpleaf_index' -include { SIMPLEAF_QUANT } from '../../modules/local/alevin/simpleaf_quant' -include { MTX_TO_H5AD } from '../../modules/local/alevin/mtx_to_h5ad' +include { ALEVINQC } from '../../modules/local/alevinqc' +include { SIMPLEAF_INDEX } from '../../modules/local/simpleaf_index' +include { SIMPLEAF_QUANT } from '../../modules/local/simpleaf_quant' +include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad' /* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */ include { GUNZIP } from '../../modules/nf-core/gunzip/main' @@ -61,7 +61,8 @@ workflow SCRNASEQ_ALEVIN { * Perform h5ad conversion */ MTX_TO_H5AD ( - SIMPLEAF_QUANT.out.alevin_results, + SIMPLEAF_QUANT.out.alevin_results.map{ meta, files -> [meta + [input_type: 'raw'], files] }, + [], [] ) ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions.first()) From e36639681a00c8f959671cad2d85d2e9b19c5a48 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Mon, 30 Sep 2024 15:04:22 +0200 Subject: [PATCH 34/66] reorganize star modules --- modules/local/star/mtx_to_h5ad.nf | 35 ------------------- modules/local/{star => }/star_align.nf | 0 .../mtx_to_h5ad_star.py} | 8 ++--- subworkflows/local/starsolo.nf | 9 +++-- 4 files changed, 10 insertions(+), 42 deletions(-) delete mode 100644 modules/local/star/mtx_to_h5ad.nf rename modules/local/{star => }/star_align.nf (100%) rename modules/local/{star/templates/mtx_to_h5ad.py => templates/mtx_to_h5ad_star.py} (92%) diff --git a/modules/local/star/mtx_to_h5ad.nf b/modules/local/star/mtx_to_h5ad.nf deleted file mode 100644 index 8e77749d..00000000 --- a/modules/local/star/mtx_to_h5ad.nf +++ /dev/null @@ -1,35 +0,0 @@ -process MTX_TO_H5AD { - tag "$meta.id" - label 'process_medium' - - conda "conda-forge::scanpy==1.10.2 conda-forge::python-igraph conda-forge::leidenalg" - container "community.wave.seqera.io/library/scanpy:1.10.2--e83da2205b92a538" - - input: - tuple val(meta), path(inputs) - path star_index - - output: - tuple val(meta2), path("${meta.id}/*h5ad"), emit: h5ad - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - // Get a file to check input type. Some aligners bring arrays instead of a single file. - def input_to_check = (inputs instanceof String) ? inputs : inputs[0] - - // check input type of inputs - input_type = (input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered' - meta2 = meta + [input_type: input_type] - - template 'mtx_to_h5ad.py' - - stub: - """ - mkdir ${meta.id} - touch ${meta.id}/${meta.id}_matrix.h5ad - touch versions.yml - """ -} diff --git a/modules/local/star/star_align.nf b/modules/local/star_align.nf similarity index 100% rename from modules/local/star/star_align.nf rename to modules/local/star_align.nf diff --git a/modules/local/star/templates/mtx_to_h5ad.py b/modules/local/templates/mtx_to_h5ad_star.py similarity index 92% rename from modules/local/star/templates/mtx_to_h5ad.py rename to modules/local/templates/mtx_to_h5ad_star.py index 3a84295d..49a1837e 100755 --- a/modules/local/star/templates/mtx_to_h5ad.py +++ b/modules/local/templates/mtx_to_h5ad_star.py @@ -82,11 +82,11 @@ def input_to_adata( # # create the directory with the sample name -os.makedirs("${meta2.id}", exist_ok=True) +os.makedirs("${meta.id}", exist_ok=True) # input_type comes from NF module adata = input_to_adata( - input_data="${meta2.input_type}", - output="${meta2.id}/${meta2.id}_${meta2.input_type}_matrix.h5ad", - sample="${meta2.id}" + input_data="${meta.input_type}", + output="${meta.id}/${meta.id}_${meta.input_type}_matrix.h5ad", + sample="${meta.id}" ) diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf index 3499f707..6b71dbdd 100644 --- a/subworkflows/local/starsolo.nf +++ b/subworkflows/local/starsolo.nf @@ -1,6 +1,6 @@ /* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ -include { STAR_ALIGN } from '../../modules/local/star/star_align' -include { MTX_TO_H5AD } from '../../modules/local/star/mtx_to_h5ad' +include { STAR_ALIGN } from '../../modules/local/star_align' +include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad' /* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */ include { GUNZIP } from '../../modules/nf-core/gunzip/main' @@ -58,7 +58,10 @@ workflow STARSOLO { * Perform h5ad conversion */ MTX_TO_H5AD ( - STAR_ALIGN.out.raw_counts.mix( STAR_ALIGN.out.filtered_counts ), + STAR_ALIGN.out.raw_counts + .map{ meta, files -> [meta + [input_type: 'raw'], files] } + .mix( STAR_ALIGN.out.filtered_counts.map{ meta, files -> [meta + [input_type: 'filtered'], files] } ), + [], star_index.map{ meta, index -> index } ) ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions.first()) From 76f126ebc0d804266bcaa49cce5ca71718e37080 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 4 Oct 2024 09:36:44 +0200 Subject: [PATCH 35/66] re-structure and include cellranger --- subworkflows/local/align_cellranger.nf | 12 ------- subworkflows/local/mtx_conversion.nf | 33 ++++++++++++++++++-- workflows/scrnaseq.nf | 43 +++++++------------------- 3 files changed, 43 insertions(+), 45 deletions(-) diff --git a/subworkflows/local/align_cellranger.nf b/subworkflows/local/align_cellranger.nf index 09442313..86b69a38 100644 --- a/subworkflows/local/align_cellranger.nf +++ b/subworkflows/local/align_cellranger.nf @@ -5,7 +5,6 @@ include { CELLRANGER_MKGTF } from "../../modules/nf-core/cellranger/mkgtf/main.nf" include { CELLRANGER_MKREF } from "../../modules/nf-core/cellranger/mkref/main.nf" include { CELLRANGER_COUNT } from "../../modules/nf-core/cellranger/count/main.nf" -include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad' // Define workflow to subset and index a genome region fasta file workflow CELLRANGER_ALIGN { @@ -62,20 +61,9 @@ workflow CELLRANGER_ALIGN { [ meta + [input_type: 'filtered'], desired_files ] } - /* - * Perform h5ad conversion - */ - MTX_TO_H5AD ( - ch_matrices_raw.mix( ch_matrices_filtered ), - [], - [] - ) - ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions.first()) - emit: ch_versions cellranger_out = CELLRANGER_COUNT.out.outs cellranger_matrices = ch_matrices_raw.mix( ch_matrices_filtered ) - cellranger_h5ad = MTX_TO_H5AD.out.h5ad star_index = cellranger_index } diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 6435966d..3ecd1b36 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -1,27 +1,56 @@ /* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ +include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad' include { CONCAT_H5AD } from '../../modules/local/concat_h5ad.nf' include { ANNDATAR_CONVERT } from '../../modules/local/anndatar_convert' +include { EMPTY_DROPLET_REMOVAL } from '../../subworkflows/local/emptydrops_removal' workflow MTX_CONVERSION { take: mtx_matrices + txp2gene + star_index samplesheet main: ch_versions = Channel.empty() + ch_h5ads = Channel.empty() + + // + // MODULE: Convert matrices to h5ad + // + MTX_TO_H5AD ( + mtx_matrices, + txp2gene, + star_index + ) + ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions.first()) + ch_h5ads = MTX_TO_H5AD.out.h5ad + + // + // SUBWORKFLOW: Run cellbender emptydrops filter + // + if ( !params.skip_emptydrops && !(params.aligner in ['cellrangerarc']) ) { + + // emptydrops should only run on the raw matrices thus, filter-out the filtered result of the aligners that can produce it + EMPTY_DROPLET_REMOVAL ( + MTX_TO_H5AD.out.h5ad.filter { meta, mtx_files -> meta.input_type == 'raw' } + ) + ch_h5ads = ch_h5ads.mix( EMPTY_DROPLET_REMOVAL.out.h5ad ) + + } // // MODULE: Convert to Rds with AnndataR package // ANNDATAR_CONVERT ( - mtx_matrices + ch_h5ads ) // // Concat sample-specific h5ad in one // - ch_concat_h5ad_input = mtx_matrices.groupTuple() // gather all sample-specific files / per type + ch_concat_h5ad_input = ch_h5ads.groupTuple() // gather all sample-specific files / per type if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') { // when having spliced / unspliced matrices, the collected tuple has two levels ( [[mtx_1, mtx_2]] ) // which nextflow break because it is not a valid 'path' thus, we have to remove one level diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 4b6323ee..282a7ab9 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -7,7 +7,6 @@ include { CELLRANGER_ALIGN } from '../subworkflows/local/align include { CELLRANGER_MULTI_ALIGN } from '../subworkflows/local/align_cellrangermulti' include { CELLRANGERARC_ALIGN } from '../subworkflows/local/align_cellrangerarc' include { UNIVERSC_ALIGN } from '../subworkflows/local/align_universc' -include { EMPTY_DROPLET_REMOVAL } from '../subworkflows/local/emptydrops_removal' include { MTX_CONVERSION } from '../subworkflows/local/mtx_conversion' include { GTF_GENE_FILTER } from '../modules/local/gtf_gene_filter' include { GUNZIP as GUNZIP_FASTA } from '../modules/nf-core/gunzip/main' @@ -87,7 +86,7 @@ workflow SCRNASEQ { empty_file = file("$projectDir/assets/EMPTY", checkIfExists: true) ch_versions = Channel.empty() - ch_h5ad_matrices = Channel.empty() + ch_mtx_matrices = Channel.empty() // Run FastQC if (!params.skip_fastqc) { @@ -137,7 +136,7 @@ workflow SCRNASEQ { ch_fastq ) ch_versions = ch_versions.mix(KALLISTO_BUSTOOLS.out.ch_versions) - ch_h5ad_matrices = ch_h5ad_matrices.mix(KALLISTO_BUSTOOLS.out.raw_counts, KALLISTO_BUSTOOLS.out.filtered_counts) + ch_mtx_matrices = ch_mtx_matrices.mix(KALLISTO_BUSTOOLS.out.raw_counts, KALLISTO_BUSTOOLS.out.filtered_counts) ch_txp2gene = KALLISTO_BUSTOOLS.out.txp2gene } @@ -155,7 +154,7 @@ workflow SCRNASEQ { ) ch_versions = ch_versions.mix(SCRNASEQ_ALEVIN.out.ch_versions) ch_multiqc_files = ch_multiqc_files.mix(SCRNASEQ_ALEVIN.out.alevin_results.map{ meta, it -> it }) - ch_h5ad_matrices = ch_h5ad_matrices.mix(SCRNASEQ_ALEVIN.out.alevin_h5ad) + ch_mtx_matrices = ch_mtx_matrices.mix(SCRNASEQ_ALEVIN.out.alevin_h5ad) } // Run STARSolo pipeline @@ -172,7 +171,7 @@ workflow SCRNASEQ { ) ch_versions = ch_versions.mix(STARSOLO.out.ch_versions) ch_multiqc_files = ch_multiqc_files.mix(STARSOLO.out.for_multiqc) - ch_h5ad_matrices = STARSOLO.out.star_h5ad + ch_mtx_matrices = STARSOLO.out.star_h5ad } // Run cellranger pipeline @@ -185,8 +184,8 @@ workflow SCRNASEQ { protocol_config['protocol'] ) ch_versions = ch_versions.mix(CELLRANGER_ALIGN.out.ch_versions) - ch_h5ad_matrices = ch_h5ad_matrices.mix(CELLRANGER_ALIGN.out.cellranger_h5ad) - ch_multiqc_files = ch_multiqc_files.mix(CELLRANGER_ALIGN.out.cellranger_out.map{ + ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_ALIGN.out.cellranger_matrices) + ch_multiqc_files = ch_multiqc_files.mix(CELLRANGER_ALIGN.out.cellranger_out.map { meta, outs -> outs.findAll{ it -> it.name == "web_summary.html"} }) } @@ -201,7 +200,7 @@ workflow SCRNASEQ { ch_fastq ) ch_versions = ch_versions.mix(UNIVERSC_ALIGN.out.ch_versions) - ch_h5ad_matrices = ch_h5ad_matrices.mix(UNIVERSC_ALIGN.out.universc_out) + ch_mtx_matrices = ch_mtx_matrices.mix(UNIVERSC_ALIGN.out.universc_out) } // Run cellrangerarc pipeline @@ -215,7 +214,7 @@ workflow SCRNASEQ { ch_cellrangerarc_config ) ch_versions = ch_versions.mix(CELLRANGERARC_ALIGN.out.ch_versions) - ch_h5ad_matrices = ch_h5ad_matrices.mix(CELLRANGERARC_ALIGN.out.cellranger_arc_out) + ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGERARC_ALIGN.out.cellranger_arc_out) } // Run cellrangermulti pipeline @@ -283,33 +282,15 @@ workflow SCRNASEQ { ch_multiqc_files = ch_multiqc_files.mix( CELLRANGER_MULTI_ALIGN.out.cellrangermulti_out.map{ meta, outs -> outs.findAll{ it -> it.name == "web_summary.html" } }) - ch_h5ad_matrices = ch_h5ad_matrices.mix(CELLRANGER_MULTI_ALIGN.out.cellrangermulti_mtx) - - } - - // SUBWORKFLOW: Run cellbender emptydrops filter - if ( !params.skip_emptydrops && !(params.aligner in ['cellrangerarc']) ) { - - // - // emptydrops should only run on the raw matrices thus, filter-out the filtered result of the aligners that can produce it - // - if ( params.aligner in [ 'cellranger', 'cellrangermulti', 'kallisto', 'star' ] ) { - ch_h5ad_matrices_for_emptydrops = - ch_h5ad_matrices.filter { meta, mtx_files -> meta.input_type == 'raw' } - } else { - ch_h5ad_matrices_for_emptydrops = ch_h5ad_matrices - } - - EMPTY_DROPLET_REMOVAL ( - ch_h5ad_matrices_for_emptydrops - ) - ch_h5ad_matrices = ch_h5ad_matrices.mix( EMPTY_DROPLET_REMOVAL.out.h5ad ) + ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_MULTI_ALIGN.out.cellrangermulti_mtx) } // Run mtx to h5ad conversion subworkflow MTX_CONVERSION ( - ch_h5ad_matrices, + ch_mtx_matrices, + ch_txp2gene, + ch_star_index, ch_input ) From b92af8b43e9716b9d1e90e29441f7a531170e564 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 4 Oct 2024 09:47:58 +0200 Subject: [PATCH 36/66] add alevin to new structure --- subworkflows/local/alevin.nf | 14 +------------- workflows/scrnaseq.nf | 2 +- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/subworkflows/local/alevin.nf b/subworkflows/local/alevin.nf index 0aa2483f..ae98cc85 100644 --- a/subworkflows/local/alevin.nf +++ b/subworkflows/local/alevin.nf @@ -3,7 +3,6 @@ include { GFFREAD_TRANSCRIPTOME } from '../../modules/local/gffread_transcriptom include { ALEVINQC } from '../../modules/local/alevinqc' include { SIMPLEAF_INDEX } from '../../modules/local/simpleaf_index' include { SIMPLEAF_QUANT } from '../../modules/local/simpleaf_quant' -include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad' /* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */ include { GUNZIP } from '../../modules/nf-core/gunzip/main' @@ -57,16 +56,6 @@ workflow SCRNASEQ_ALEVIN { ) ch_versions = ch_versions.mix(SIMPLEAF_QUANT.out.versions) - /* - * Perform h5ad conversion - */ - MTX_TO_H5AD ( - SIMPLEAF_QUANT.out.alevin_results.map{ meta, files -> [meta + [input_type: 'raw'], files] }, - [], - [] - ) - ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions.first()) - /* * Run alevinQC */ @@ -75,7 +64,6 @@ workflow SCRNASEQ_ALEVIN { emit: ch_versions - alevin_results = SIMPLEAF_QUANT.out.alevin_results - alevin_h5ad = MTX_TO_H5AD.out.h5ad + alevin_results = SIMPLEAF_QUANT.out.alevin_results.map{ meta, files -> [meta + [input_type: 'raw'], files] } alevinqc = ALEVINQC.out.report } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 282a7ab9..a55029ea 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -154,7 +154,7 @@ workflow SCRNASEQ { ) ch_versions = ch_versions.mix(SCRNASEQ_ALEVIN.out.ch_versions) ch_multiqc_files = ch_multiqc_files.mix(SCRNASEQ_ALEVIN.out.alevin_results.map{ meta, it -> it }) - ch_mtx_matrices = ch_mtx_matrices.mix(SCRNASEQ_ALEVIN.out.alevin_h5ad) + ch_mtx_matrices = ch_mtx_matrices.mix(SCRNASEQ_ALEVIN.out.alevin_results) } // Run STARSolo pipeline From c57b09fc98fe0b21c54234dcf3db729f1ab04f60 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 4 Oct 2024 10:29:37 +0200 Subject: [PATCH 37/66] add star to new structure --- subworkflows/local/starsolo.nf | 18 +++++------------- workflows/scrnaseq.nf | 2 +- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf index 6b71dbdd..d69ce9d9 100644 --- a/subworkflows/local/starsolo.nf +++ b/subworkflows/local/starsolo.nf @@ -54,24 +54,16 @@ workflow STARSOLO { ) ch_versions = ch_versions.mix(STAR_ALIGN.out.versions) - /* - * Perform h5ad conversion - */ - MTX_TO_H5AD ( - STAR_ALIGN.out.raw_counts - .map{ meta, files -> [meta + [input_type: 'raw'], files] } - .mix( STAR_ALIGN.out.filtered_counts.map{ meta, files -> [meta + [input_type: 'filtered'], files] } ), - [], - star_index.map{ meta, index -> index } - ) - ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions.first()) + // generate channel of star counts with correct metadata + ch_star_counts = + STAR_ALIGN.out.raw_counts.map{ meta, files -> [meta + [input_type: 'raw'], files] } + .mix( STAR_ALIGN.out.filtered_counts.map{ meta, files -> [meta + [input_type: 'filtered'], files] } ) emit: ch_versions // get rid of meta for star index star_result = STAR_ALIGN.out.tab - star_counts = STAR_ALIGN.out.counts - star_h5ad = MTX_TO_H5AD.out.h5ad + star_counts = ch_star_counts for_multiqc = STAR_ALIGN.out.log_final.map{ meta, it -> it } } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index a55029ea..2f2301c1 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -171,7 +171,7 @@ workflow SCRNASEQ { ) ch_versions = ch_versions.mix(STARSOLO.out.ch_versions) ch_multiqc_files = ch_multiqc_files.mix(STARSOLO.out.for_multiqc) - ch_mtx_matrices = STARSOLO.out.star_h5ad + ch_mtx_matrices = STARSOLO.out.star_counts } // Run cellranger pipeline From d0d1f814b56971d1757e681a6a29f022a80080fd Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 4 Oct 2024 13:04:06 +0200 Subject: [PATCH 38/66] Add kallisto standard workflow to structure --- modules/local/templates/mtx_to_h5ad_alevin.py | 4 +- .../local/templates/mtx_to_h5ad_kallisto.py | 103 ++++++++++++++++++ subworkflows/local/kallisto_bustools.nf | 8 +- workflows/scrnaseq.nf | 2 +- 4 files changed, 109 insertions(+), 8 deletions(-) create mode 100755 modules/local/templates/mtx_to_h5ad_kallisto.py diff --git a/modules/local/templates/mtx_to_h5ad_alevin.py b/modules/local/templates/mtx_to_h5ad_alevin.py index fba5a34f..50107e0c 100755 --- a/modules/local/templates/mtx_to_h5ad_alevin.py +++ b/modules/local/templates/mtx_to_h5ad_alevin.py @@ -17,8 +17,8 @@ def _mtx_to_adata( ): adata = sc.read_mtx(f"{input}/quants_mat.mtx") - adata.obs_names = pd.read_csv(f"{input}/quants_mat_rows.txt", header=None, sep="\t")[0].values - adata.var_names = pd.read_csv(f"{input}/quants_mat_cols.txt", header=None, sep="\t")[0].values + adata.obs_names = pd.read_csv(f"{input}/quants_mat_rows.txt", header=None, sep="\\t")[0].values + adata.var_names = pd.read_csv(f"{input}/quants_mat_cols.txt", header=None, sep="\\t")[0].values adata.obs["sample"] = sample return adata diff --git a/modules/local/templates/mtx_to_h5ad_kallisto.py b/modules/local/templates/mtx_to_h5ad_kallisto.py new file mode 100755 index 00000000..cf27f13c --- /dev/null +++ b/modules/local/templates/mtx_to_h5ad_kallisto.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python + +# Set numba chache dir to current working directory (which is a writable mount also in containers) +import os + +os.environ["NUMBA_CACHE_DIR"] = "." + +import scanpy as sc +import pandas as pd +import argparse +from anndata import AnnData +import platform +import glob + +def _mtx_to_adata( + input: str, + sample: str, + t2g: str +): + + adata = sc.read_mtx(glob.glob(f"{input}/*.mtx")[0]) + adata.obs_names = pd.read_csv(glob.glob(f"{input}/*.barcodes.txt")[0], header=None, sep="\\t")[0].values + adata.var_names = pd.read_csv(glob.glob(f"{input}/*.genes.txt")[0], header=None, sep="\\t")[0].values + adata.obs["sample"] = sample + + txp2gene = pd.read_table(glob.glob(f"{t2g}")[0], header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2]) + txp2gene = txp2gene.drop_duplicates(subset="gene_id").set_index("gene_id") + adata.var = pd.merge(adata.var, txp2gene, left_index=True, right_index=True) + + return adata + + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +def dump_versions(): + versions = { + "${task.process}": { + "python": platform.python_version(), + "scanpy": sc.__version__, + "pandas": pd.__version__ + } + } + + with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) + +def input_to_adata( + input_data: str, + output: str, + sample: str, + t2g: str +): + print(f"Reading in {input_data}") + + # open main data + adata = _mtx_to_adata(input_data, sample, t2g) + + # standard format + # index are gene IDs and symbols are a column + adata.var['gene_versions'] = adata.var.index + adata.var['gene_ids'] = adata.var['gene_versions'].str.split('.').str[0] + adata.var.index = adata.var["gene_ids"].values + adata.var = adata.var.drop("gene_ids", axis=1) + adata.var_names_make_unique() + + # write results + adata.write_h5ad(f"{output}", compression="gzip") + print(f"Wrote h5ad file to {output}") + + # dump versions + dump_versions() + + return adata + +# +# Run main script +# + +# create the directory with the sample name +os.makedirs("${meta.id}", exist_ok=True) + +# input_type comes from NF module +adata = input_to_adata( + input_data="${inputs}", + output="${meta.id}/${meta.id}_${meta.input_type}_matrix.h5ad", + sample="${meta.id}", + t2g="${txp2gene}" +) diff --git a/subworkflows/local/kallisto_bustools.nf b/subworkflows/local/kallisto_bustools.nf index 3deee2c5..d6b171a6 100644 --- a/subworkflows/local/kallisto_bustools.nf +++ b/subworkflows/local/kallisto_bustools.nf @@ -55,20 +55,18 @@ workflow KALLISTO_BUSTOOLS { // get raw/filtered counts ch_raw_counts = KALLISTOBUSTOOLS_COUNT.out.count.map{ meta, kb_dir -> if (file("${kb_dir.toUriString()}/counts_unfiltered").exists()) { - [meta, file("${kb_dir.toUriString()}/counts_unfiltered")] + [meta + [input_type: 'raw'], file("${kb_dir.toUriString()}/counts_unfiltered")] } } ch_filtered_counts = KALLISTOBUSTOOLS_COUNT.out.count.map{ meta, kb_dir -> if (file("${kb_dir.toUriString()}/counts_filtered").exists()) { - [meta, file("${kb_dir.toUriString()}/counts_filtered")] + [meta + [input_type: 'filtered'], file("${kb_dir.toUriString()}/counts_filtered")] } } emit: ch_versions - counts = KALLISTOBUSTOOLS_COUNT.out.count - raw_counts = ch_raw_counts - filtered_counts = ch_filtered_counts + counts = ch_raw_counts.mix (ch_filtered_counts) txp2gene = txp2gene.collect() } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 2f2301c1..3f3c1017 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -136,7 +136,7 @@ workflow SCRNASEQ { ch_fastq ) ch_versions = ch_versions.mix(KALLISTO_BUSTOOLS.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(KALLISTO_BUSTOOLS.out.raw_counts, KALLISTO_BUSTOOLS.out.filtered_counts) + ch_mtx_matrices = KALLISTO_BUSTOOLS.out.counts ch_txp2gene = KALLISTO_BUSTOOLS.out.txp2gene } From aa99cd17eac212e350d1d20fe251ccd187c9343c Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 4 Oct 2024 16:16:11 +0200 Subject: [PATCH 39/66] Account for non-standard kallisto workflows --- .../local/templates/mtx_to_h5ad_kallisto.py | 58 ++++++++++++------- subworkflows/local/emptydrops_removal.nf | 2 +- subworkflows/local/mtx_conversion.nf | 20 ++++++- 3 files changed, 56 insertions(+), 24 deletions(-) diff --git a/modules/local/templates/mtx_to_h5ad_kallisto.py b/modules/local/templates/mtx_to_h5ad_kallisto.py index cf27f13c..24efe7e0 100755 --- a/modules/local/templates/mtx_to_h5ad_kallisto.py +++ b/modules/local/templates/mtx_to_h5ad_kallisto.py @@ -13,14 +13,16 @@ import glob def _mtx_to_adata( - input: str, - sample: str, - t2g: str + matrix: str, + barcodes: str, + features: str, + t2g: str, + sample: str ): - adata = sc.read_mtx(glob.glob(f"{input}/*.mtx")[0]) - adata.obs_names = pd.read_csv(glob.glob(f"{input}/*.barcodes.txt")[0], header=None, sep="\\t")[0].values - adata.var_names = pd.read_csv(glob.glob(f"{input}/*.genes.txt")[0], header=None, sep="\\t")[0].values + adata = sc.read_mtx(matrix) + adata.obs_names = pd.read_csv(barcodes, header=None, sep="\\t")[0].values + adata.var_names = pd.read_csv(features, header=None, sep="\\t")[0].values adata.obs["sample"] = sample txp2gene = pd.read_table(glob.glob(f"{t2g}")[0], header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2]) @@ -60,15 +62,17 @@ def dump_versions(): f.write(format_yaml_like(versions)) def input_to_adata( - input_data: str, + matrix: str, + barcodes: str, + features: str, + t2g: str, output: str, sample: str, - t2g: str ): - print(f"Reading in {input_data}") + print(f"Reading in {matrix}") # open main data - adata = _mtx_to_adata(input_data, sample, t2g) + adata = _mtx_to_adata(matrix=matrix, barcodes=barcodes, features=features, sample=sample, t2g=t2g) # standard format # index are gene IDs and symbols are a column @@ -82,11 +86,6 @@ def input_to_adata( adata.write_h5ad(f"{output}", compression="gzip") print(f"Wrote h5ad file to {output}") - # dump versions - dump_versions() - - return adata - # # Run main script # @@ -95,9 +94,26 @@ def input_to_adata( os.makedirs("${meta.id}", exist_ok=True) # input_type comes from NF module -adata = input_to_adata( - input_data="${inputs}", - output="${meta.id}/${meta.id}_${meta.input_type}_matrix.h5ad", - sample="${meta.id}", - t2g="${txp2gene}" -) +if "${params.kb_workflow}" == "standard": + input_to_adata( + matrix=glob.glob("${inputs}/*.mtx")[0], + barcodes=glob.glob("${inputs}/*.barcodes.txt")[0], + features=glob.glob("${inputs}/*.genes.txt")[0], + output="${meta.id}/${meta.id}_${meta.input_type}_matrix.h5ad", + sample="${meta.id}", + t2g="${txp2gene}" + ) + +else: + for type in ['spliced', 'unspliced']: + input_to_adata( + matrix=glob.glob("${inputs}/" + f"{type}*.mtx")[0], + barcodes=glob.glob("${inputs}/" + f"{type}*.barcodes.txt")[0], + features=glob.glob("${inputs}/" + f"{type}*.genes.txt")[0], + output="${meta.id}/${meta.id}_${meta.input_type}" + f"_{type}_matrix.h5ad", + sample="${meta.id}", + t2g="${txp2gene}" + ) + +# dump versions +dump_versions() diff --git a/subworkflows/local/emptydrops_removal.nf b/subworkflows/local/emptydrops_removal.nf index 7171c3f3..2ccacc26 100644 --- a/subworkflows/local/emptydrops_removal.nf +++ b/subworkflows/local/emptydrops_removal.nf @@ -16,7 +16,7 @@ workflow EMPTY_DROPLET_REMOVAL { .join(CELLBENDER_REMOVEBACKGROUND.out.barcodes) .map { meta, h5ad, csv -> def meta_clone = meta.clone() - meta_clone.input_type = 'emptydrops_filter' + meta_clone.input_type = meta['input_type'].toString().replaceAll('raw', 'emptydrops_filter') [ meta_clone, h5ad, csv ] } diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 3ecd1b36..55bd68dc 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -25,7 +25,23 @@ workflow MTX_CONVERSION { star_index ) ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions.first()) - ch_h5ads = MTX_TO_H5AD.out.h5ad + + // fix channel size when kallisto non-standard workflow + if (params.aligner == 'kallisto' && !(params.kb_workflow == 'standard')) { + ch_h5ads = + MTX_TO_H5AD.out.h5ad + .transpose() + .map { meta, h5ad -> + def meta_clone = meta.clone() + def spc_prefix = h5ad.toString().contains('unspliced') ? 'un' : '' + + meta_clone["input_type"] = "${meta.input_type}_${spc_prefix}spliced" + + [ meta_clone, h5ad ] + } + } else { + ch_h5ads = MTX_TO_H5AD.out.h5ad + } // // SUBWORKFLOW: Run cellbender emptydrops filter @@ -34,7 +50,7 @@ workflow MTX_CONVERSION { // emptydrops should only run on the raw matrices thus, filter-out the filtered result of the aligners that can produce it EMPTY_DROPLET_REMOVAL ( - MTX_TO_H5AD.out.h5ad.filter { meta, mtx_files -> meta.input_type == 'raw' } + ch_h5ads.filter { meta, mtx_files -> meta.input_type.contains('raw') } ) ch_h5ads = ch_h5ads.mix( EMPTY_DROPLET_REMOVAL.out.h5ad ) From 0ca1161dbac0722cd0e46afe270cb9cc40260d90 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Fri, 4 Oct 2024 16:19:31 +0200 Subject: [PATCH 40/66] Simplify alevin template --- modules/local/templates/mtx_to_h5ad_alevin.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/modules/local/templates/mtx_to_h5ad_alevin.py b/modules/local/templates/mtx_to_h5ad_alevin.py index 50107e0c..db295f1b 100755 --- a/modules/local/templates/mtx_to_h5ad_alevin.py +++ b/modules/local/templates/mtx_to_h5ad_alevin.py @@ -76,11 +76,6 @@ def input_to_adata( adata.write_h5ad(f"{output}", compression="gzip") print(f"Wrote h5ad file to {output}") - # dump versions - dump_versions() - - return adata - # # Run main script # @@ -89,8 +84,11 @@ def input_to_adata( os.makedirs("${meta.id}", exist_ok=True) # input_type comes from NF module -adata = input_to_adata( +input_to_adata( input_data="${meta.id}_alevin_results/af_quant/alevin/", output="${meta.id}/${meta.id}_${meta.input_type}_matrix.h5ad", sample="${meta.id}" ) + +# dump versions +dump_versions() From 28251b068a5df1f12661195fbb0b60eb29656337 Mon Sep 17 00:00:00 2001 From: "zxBIB Almeida,Felipe (GCBDS) EXTERNAL" Date: Mon, 7 Oct 2024 09:21:35 +0200 Subject: [PATCH 41/66] reorganise star template --- modules/local/templates/mtx_to_h5ad_star.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/modules/local/templates/mtx_to_h5ad_star.py b/modules/local/templates/mtx_to_h5ad_star.py index 49a1837e..0cc9a969 100755 --- a/modules/local/templates/mtx_to_h5ad_star.py +++ b/modules/local/templates/mtx_to_h5ad_star.py @@ -72,11 +72,6 @@ def input_to_adata( adata.write_h5ad(f"{output}", compression="gzip") print(f"Wrote h5ad file to {output}") - # dump versions - dump_versions() - - return adata - # # Run main script # @@ -85,8 +80,11 @@ def input_to_adata( os.makedirs("${meta.id}", exist_ok=True) # input_type comes from NF module -adata = input_to_adata( +input_to_adata( input_data="${meta.input_type}", output="${meta.id}/${meta.id}_${meta.input_type}_matrix.h5ad", sample="${meta.id}" ) + +# dump versions +dump_versions() From c1e8357112b8e389bab9e5a6503b052b6147594c Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 09:02:53 +0100 Subject: [PATCH 42/66] write uncompressed Co-authored-by: Gregor Sturm --- modules/local/templates/concat_h5ad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/templates/concat_h5ad.py b/modules/local/templates/concat_h5ad.py index 033bc89a..2a483e7b 100755 --- a/modules/local/templates/concat_h5ad.py +++ b/modules/local/templates/concat_h5ad.py @@ -34,6 +34,6 @@ def read_samplesheet(samplesheet): # merge with data.frame, on sample information adata.obs = adata.obs.join(df_samplesheet, on="sample").astype(str) - adata.write_h5ad("combined_${meta.input_type}_matrix.h5ad", compression="gzip") + adata.write_h5ad("combined_${meta.input_type}_matrix.h5ad") print("Wrote h5ad file to {}".format("combined_${meta.input_type}_matrix.h5ad")) From ae857102b56cc8f68def3bb45b1f167edb672c07 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 09:04:52 +0100 Subject: [PATCH 43/66] use .astype(str) Co-authored-by: Gregor Sturm --- modules/local/templates/concat_h5ad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/templates/concat_h5ad.py b/modules/local/templates/concat_h5ad.py index 2a483e7b..e2cce156 100755 --- a/modules/local/templates/concat_h5ad.py +++ b/modules/local/templates/concat_h5ad.py @@ -16,7 +16,7 @@ def read_samplesheet(samplesheet): # samplesheet may contain replicates, when it has, # group information from replicates and collapse with commas # only keep unique values using set() - df = df.groupby(["sample"]).agg(lambda column: ",".join(set(str(column)))) + df = df.groupby(["sample"]).agg(lambda column: ",".join(set(column.astype(str))) return df From ae8809a828b5543dc1c5ed968fa057e962fb13bc Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 09:05:14 +0100 Subject: [PATCH 44/66] simplify iteration Co-authored-by: Gregor Sturm --- modules/local/templates/mtx_to_h5ad_cellranger.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/local/templates/mtx_to_h5ad_cellranger.py b/modules/local/templates/mtx_to_h5ad_cellranger.py index 294a15e7..be870829 100755 --- a/modules/local/templates/mtx_to_h5ad_cellranger.py +++ b/modules/local/templates/mtx_to_h5ad_cellranger.py @@ -69,9 +69,7 @@ def input_to_adata( # standard format # index are gene IDs and symbols are a column adata.var['gene_versions'] = adata.var.index - adata.var['gene_ids'] = adata.var['gene_versions'].str.split('.').str[0] - adata.var.index = adata.var["gene_ids"].values - adata.var = adata.var.drop("gene_ids", axis=1) + adata.var.index = adata.var['gene_versions'].str.split('.').str[0] adata.var_names_make_unique() # write results From 68464de7d8d0a71a827379f2d15d1f8f4b310cfc Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 09:05:35 +0100 Subject: [PATCH 45/66] perform join left operation Co-authored-by: Gregor Sturm --- modules/local/templates/mtx_to_h5ad_kallisto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/templates/mtx_to_h5ad_kallisto.py b/modules/local/templates/mtx_to_h5ad_kallisto.py index 24efe7e0..87511823 100755 --- a/modules/local/templates/mtx_to_h5ad_kallisto.py +++ b/modules/local/templates/mtx_to_h5ad_kallisto.py @@ -27,7 +27,7 @@ def _mtx_to_adata( txp2gene = pd.read_table(glob.glob(f"{t2g}")[0], header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2]) txp2gene = txp2gene.drop_duplicates(subset="gene_id").set_index("gene_id") - adata.var = pd.merge(adata.var, txp2gene, left_index=True, right_index=True) + adata.var = adata.var.join(txp2gene, how="left") return adata From fdedc4d3385e59b2f2670a7d16525928f0dc384f Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 09:05:51 +0100 Subject: [PATCH 46/66] do not compress output h5ad Co-authored-by: Gregor Sturm --- modules/local/templates/mtx_to_h5ad_kallisto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/templates/mtx_to_h5ad_kallisto.py b/modules/local/templates/mtx_to_h5ad_kallisto.py index 87511823..fab07f59 100755 --- a/modules/local/templates/mtx_to_h5ad_kallisto.py +++ b/modules/local/templates/mtx_to_h5ad_kallisto.py @@ -83,7 +83,7 @@ def input_to_adata( adata.var_names_make_unique() # write results - adata.write_h5ad(f"{output}", compression="gzip") + adata.write_h5ad(f"{output}") print(f"Wrote h5ad file to {output}") # From 187dbf6d04d8fe73a4b4cabbca2b957656035ed6 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 09:06:26 +0100 Subject: [PATCH 47/66] perform join left operation Co-authored-by: Gregor Sturm --- modules/local/templates/concat_h5ad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/templates/concat_h5ad.py b/modules/local/templates/concat_h5ad.py index e2cce156..ac5da7f6 100755 --- a/modules/local/templates/concat_h5ad.py +++ b/modules/local/templates/concat_h5ad.py @@ -33,7 +33,7 @@ def read_samplesheet(samplesheet): adata = ad.concat(dict_of_h5ad, label="sample", merge="unique", index_unique="_") # merge with data.frame, on sample information - adata.obs = adata.obs.join(df_samplesheet, on="sample").astype(str) + adata.obs = adata.obs.join(df_samplesheet, on="sample", how="left").astype(str) adata.write_h5ad("combined_${meta.input_type}_matrix.h5ad") print("Wrote h5ad file to {}".format("combined_${meta.input_type}_matrix.h5ad")) From 98af608abbc3aaa562b052ccecebb0030c9ef0b6 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 08:21:15 +0000 Subject: [PATCH 48/66] not compress h5ad output --- modules/local/templates/mtx_to_h5ad_alevin.py | 2 +- modules/local/templates/mtx_to_h5ad_cellranger.py | 2 +- modules/local/templates/mtx_to_h5ad_star.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/templates/mtx_to_h5ad_alevin.py b/modules/local/templates/mtx_to_h5ad_alevin.py index db295f1b..a4563d6b 100755 --- a/modules/local/templates/mtx_to_h5ad_alevin.py +++ b/modules/local/templates/mtx_to_h5ad_alevin.py @@ -73,7 +73,7 @@ def input_to_adata( adata.var_names_make_unique() # write results - adata.write_h5ad(f"{output}", compression="gzip") + adata.write_h5ad(f"{output}") print(f"Wrote h5ad file to {output}") # diff --git a/modules/local/templates/mtx_to_h5ad_cellranger.py b/modules/local/templates/mtx_to_h5ad_cellranger.py index be870829..4a40201d 100755 --- a/modules/local/templates/mtx_to_h5ad_cellranger.py +++ b/modules/local/templates/mtx_to_h5ad_cellranger.py @@ -73,7 +73,7 @@ def input_to_adata( adata.var_names_make_unique() # write results - adata.write_h5ad(f"{output}", compression="gzip") + adata.write_h5ad(f"{output}") print(f"Wrote h5ad file to {output}") # dump versions diff --git a/modules/local/templates/mtx_to_h5ad_star.py b/modules/local/templates/mtx_to_h5ad_star.py index 0cc9a969..5614e698 100755 --- a/modules/local/templates/mtx_to_h5ad_star.py +++ b/modules/local/templates/mtx_to_h5ad_star.py @@ -69,7 +69,7 @@ def input_to_adata( adata.var = adata.var.drop("gene_ids", axis=1) # write results - adata.write_h5ad(f"{output}", compression="gzip") + adata.write_h5ad(f"{output}") print(f"Wrote h5ad file to {output}") # From efd6299bd54cee59d68568f6fab20f1d9339eb7b Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 08:22:03 +0000 Subject: [PATCH 49/66] simplify iteration --- modules/local/templates/mtx_to_h5ad_alevin.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/local/templates/mtx_to_h5ad_alevin.py b/modules/local/templates/mtx_to_h5ad_alevin.py index a4563d6b..d36d8797 100755 --- a/modules/local/templates/mtx_to_h5ad_alevin.py +++ b/modules/local/templates/mtx_to_h5ad_alevin.py @@ -67,9 +67,7 @@ def input_to_adata( # index are gene IDs and symbols are a column # TODO: how to get gene_symbols for alevin? adata.var['gene_versions'] = adata.var.index - adata.var['gene_ids'] = adata.var['gene_versions'].str.split('.').str[0] - adata.var.index = adata.var["gene_ids"].values - adata.var = adata.var.drop("gene_ids", axis=1) + adata.var.index = adata.var['gene_versions'].str.split('.').str[0] adata.var_names_make_unique() # write results From f357fd7ccbed89b715d1f0e1fa7b7b9ca9efeda1 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 08:23:53 +0000 Subject: [PATCH 50/66] simplify index iteration --- modules/local/templates/mtx_to_h5ad_kallisto.py | 4 +--- modules/local/templates/mtx_to_h5ad_star.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/modules/local/templates/mtx_to_h5ad_kallisto.py b/modules/local/templates/mtx_to_h5ad_kallisto.py index fab07f59..ac71fc42 100755 --- a/modules/local/templates/mtx_to_h5ad_kallisto.py +++ b/modules/local/templates/mtx_to_h5ad_kallisto.py @@ -77,9 +77,7 @@ def input_to_adata( # standard format # index are gene IDs and symbols are a column adata.var['gene_versions'] = adata.var.index - adata.var['gene_ids'] = adata.var['gene_versions'].str.split('.').str[0] - adata.var.index = adata.var["gene_ids"].values - adata.var = adata.var.drop("gene_ids", axis=1) + adata.var.index = adata.var['gene_versions'].str.split('.').str[0] adata.var_names_make_unique() # write results diff --git a/modules/local/templates/mtx_to_h5ad_star.py b/modules/local/templates/mtx_to_h5ad_star.py index 5614e698..aa0cd02f 100755 --- a/modules/local/templates/mtx_to_h5ad_star.py +++ b/modules/local/templates/mtx_to_h5ad_star.py @@ -64,9 +64,7 @@ def input_to_adata( # index are gene IDs and symbols are a column adata.var["gene_symbol"] = adata.var.index adata.var['gene_versions'] = adata.var["gene_ids"] - adata.var['gene_ids'] = adata.var['gene_versions'].str.split('.').str[0] - adata.var.index = adata.var["gene_ids"].values - adata.var = adata.var.drop("gene_ids", axis=1) + adata.var.index = adata.var['gene_versions'].str.split('.').str[0] # write results adata.write_h5ad(f"{output}") From bd1a74cebe0fd8bcf725f2f8be2a461cedb598ea Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 08:24:22 +0000 Subject: [PATCH 51/66] fix unmatched parenthesis --- modules/local/templates/concat_h5ad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/templates/concat_h5ad.py b/modules/local/templates/concat_h5ad.py index ac5da7f6..087f7fde 100755 --- a/modules/local/templates/concat_h5ad.py +++ b/modules/local/templates/concat_h5ad.py @@ -16,7 +16,7 @@ def read_samplesheet(samplesheet): # samplesheet may contain replicates, when it has, # group information from replicates and collapse with commas # only keep unique values using set() - df = df.groupby(["sample"]).agg(lambda column: ",".join(set(column.astype(str))) + df = df.groupby(["sample"]).agg(lambda column: ",".join(set(column.astype(str)))) return df From 4731e009f2264b64356f490f58aa1479ac3dbc31 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 08:50:33 +0000 Subject: [PATCH 52/66] fix use of igenomes ... pipeline was not properly selecting igenomes options --- .../utils_nfcore_scrnaseq_pipeline/main.nf | 5 +++- workflows/scrnaseq.nf | 29 +++++++++---------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/subworkflows/local/utils_nfcore_scrnaseq_pipeline/main.nf b/subworkflows/local/utils_nfcore_scrnaseq_pipeline/main.nf index dd54d352..16569d5a 100644 --- a/subworkflows/local/utils_nfcore_scrnaseq_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_scrnaseq_pipeline/main.nf @@ -202,9 +202,12 @@ def getGenomeAttribute(attribute) { if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) { if (params.genomes[ params.genome ].containsKey(attribute)) { return params.genomes[ params.genome ][ attribute ] + } else { + return null } + } else { + return null } - return null } // diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 3f3c1017..61401185 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -31,12 +31,11 @@ workflow SCRNASEQ { error "Only cellranger supports `protocol = 'auto'`. Please specify the protocol manually!" } - params.fasta = getGenomeAttribute('fasta') - params.gtf = getGenomeAttribute('gtf') - params.star_index = getGenomeAttribute('star') - - ch_genome_fasta = params.fasta ? file(params.fasta, checkIfExists: true) : [] - ch_gtf = params.gtf ? file(params.gtf, checkIfExists: true) : [] + // search igenomes, but overwrite with user paths + // cannot use 'params. = ' in workflow, it does not overwrite parameter + def fasta_file = params.fasta ? params.fasta : getGenomeAttribute('fasta') + def gtf_file = params.gtf ? params.gtf : getGenomeAttribute('gtf') + def star_index = params.star_index ? params.star_index : getGenomeAttribute('star') // general input and params ch_transcript_fasta = params.transcript_fasta ? file(params.transcript_fasta): [] @@ -70,7 +69,7 @@ workflow SCRNASEQ { ch_salmon_index = params.salmon_index ? file(params.salmon_index) : [] //star params - star_index = params.star_index ? file(params.star_index, checkIfExists: true) : null + star_index = star_index ? file(star_index, checkIfExists: true) : null ch_star_index = star_index ? [[id: star_index.baseName], star_index] : [] star_feature = params.star_feature @@ -98,24 +97,24 @@ workflow SCRNASEQ { // // Uncompress genome fasta file if required // - if (params.fasta) { - if (params.fasta.endsWith('.gz')) { - ch_genome_fasta = GUNZIP_FASTA ( [ [:], file(params.fasta) ] ).gunzip.map { it[1] } + if (fasta_file) { + if (fasta_file.endsWith('.gz')) { + ch_genome_fasta = GUNZIP_FASTA ( [ [:], file(fasta_file) ] ).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions) } else { - ch_genome_fasta = Channel.value( file(params.fasta) ) + ch_genome_fasta = Channel.value( file(fasta_file) ) } } // // Uncompress GTF annotation file or create from GFF3 if required // - if (params.gtf) { - if (params.gtf.endsWith('.gz')) { - ch_gtf = GUNZIP_GTF ( [ [:], file(params.gtf) ] ).gunzip.map { it[1] } + if (gtf_file) { + if (gtf_file.endsWith('.gz')) { + ch_gtf = GUNZIP_GTF ( [ [:], file(gtf_file) ] ).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) } else { - ch_gtf = Channel.value( file(params.gtf) ) + ch_gtf = Channel.value( file(gtf_file) ) } } From a71348fe887a3798d07a391a7c42f53973e308d8 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 09:44:43 +0000 Subject: [PATCH 53/66] correct values parsing --- modules/local/templates/mtx_to_h5ad_alevin.py | 2 +- modules/local/templates/mtx_to_h5ad_cellranger.py | 2 +- modules/local/templates/mtx_to_h5ad_kallisto.py | 2 +- modules/local/templates/mtx_to_h5ad_star.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/local/templates/mtx_to_h5ad_alevin.py b/modules/local/templates/mtx_to_h5ad_alevin.py index d36d8797..83cbd946 100755 --- a/modules/local/templates/mtx_to_h5ad_alevin.py +++ b/modules/local/templates/mtx_to_h5ad_alevin.py @@ -67,7 +67,7 @@ def input_to_adata( # index are gene IDs and symbols are a column # TODO: how to get gene_symbols for alevin? adata.var['gene_versions'] = adata.var.index - adata.var.index = adata.var['gene_versions'].str.split('.').str[0] + adata.var.index = adata.var['gene_versions'].str.split('.').str[0].values adata.var_names_make_unique() # write results diff --git a/modules/local/templates/mtx_to_h5ad_cellranger.py b/modules/local/templates/mtx_to_h5ad_cellranger.py index 4a40201d..76ec4998 100755 --- a/modules/local/templates/mtx_to_h5ad_cellranger.py +++ b/modules/local/templates/mtx_to_h5ad_cellranger.py @@ -69,7 +69,7 @@ def input_to_adata( # standard format # index are gene IDs and symbols are a column adata.var['gene_versions'] = adata.var.index - adata.var.index = adata.var['gene_versions'].str.split('.').str[0] + adata.var.index = adata.var['gene_versions'].str.split('.').str[0].values adata.var_names_make_unique() # write results diff --git a/modules/local/templates/mtx_to_h5ad_kallisto.py b/modules/local/templates/mtx_to_h5ad_kallisto.py index ac71fc42..0fc606ba 100755 --- a/modules/local/templates/mtx_to_h5ad_kallisto.py +++ b/modules/local/templates/mtx_to_h5ad_kallisto.py @@ -77,7 +77,7 @@ def input_to_adata( # standard format # index are gene IDs and symbols are a column adata.var['gene_versions'] = adata.var.index - adata.var.index = adata.var['gene_versions'].str.split('.').str[0] + adata.var.index = adata.var['gene_versions'].str.split('.').str[0].values adata.var_names_make_unique() # write results diff --git a/modules/local/templates/mtx_to_h5ad_star.py b/modules/local/templates/mtx_to_h5ad_star.py index aa0cd02f..b4c71e9e 100755 --- a/modules/local/templates/mtx_to_h5ad_star.py +++ b/modules/local/templates/mtx_to_h5ad_star.py @@ -64,7 +64,7 @@ def input_to_adata( # index are gene IDs and symbols are a column adata.var["gene_symbol"] = adata.var.index adata.var['gene_versions'] = adata.var["gene_ids"] - adata.var.index = adata.var['gene_versions'].str.split('.').str[0] + adata.var.index = adata.var['gene_versions'].str.split('.').str[0].values # write results adata.write_h5ad(f"{output}") From d2f9cfdfa0c1af8e0c14f1b99dd7a9e071ff7452 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 09:50:20 +0000 Subject: [PATCH 54/66] fix container registry --- modules/local/anndatar_convert.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/anndatar_convert.nf b/modules/local/anndatar_convert.nf index d6a8271d..8fb6dac3 100644 --- a/modules/local/anndatar_convert.nf +++ b/modules/local/anndatar_convert.nf @@ -3,7 +3,7 @@ process ANNDATAR_CONVERT { label 'process_medium' - container "docker://fmalmeida/anndatar:dev" // TODO: Fix + container "docker.io/fmalmeida/anndatar:dev" // TODO: Fix input: tuple val(meta), path(h5ad) From 5c31226a18f9d57d9f6d5971b480095f81d66a4c Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 10:13:49 +0000 Subject: [PATCH 55/66] do not save versions files --- conf/modules.config | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a6fc3411..4a230680 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -80,13 +80,15 @@ if(params.aligner == "cellranger") { withName: CELLRANGER_MKREF { publishDir = [ path: "${params.outdir}/${params.aligner}/mkref", - mode: params.publish_dir_mode + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } withName: CELLRANGER_COUNT { publishDir = [ path: "${params.outdir}/${params.aligner}/count", - mode: params.publish_dir_mode + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.args = {"--chemistry ${meta.chemistry} --create-bam true " + (meta.expected_cells ? "--expect-cells ${meta.expected_cells}" : '')} time = { check_max( 240.h * task.attempt, 'time' ) } @@ -183,20 +185,20 @@ if (params.aligner == "alevin") { if (params.aligner == "star") { process { - withName: STAR_ALIGN { - ext.args = "--readFilesCommand zcat --runDirPerm All_RWX --outWigType bedGraph --twopassMode Basic --outSAMtype BAM SortedByCoordinate" - } withName: STAR_GENOMEGENERATE { publishDir = [ path: { "${params.outdir}/${params.aligner}/genome_generate" }, mode: params.publish_dir_mode, - enabled: params.save_reference + enabled: params.save_reference, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } withName: STAR_ALIGN { + ext.args = "--readFilesCommand zcat --runDirPerm All_RWX --outWigType bedGraph --twopassMode Basic --outSAMtype BAM SortedByCoordinate" publishDir = [ path: { "${params.outdir}/${params.aligner}/${meta.id}" }, - mode: params.publish_dir_mode + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } } @@ -208,14 +210,16 @@ if (params.aligner == 'kallisto') { publishDir = [ path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode, - enabled: params.save_reference + enabled: params.save_reference, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } withName: KALLISTOBUSTOOLS_COUNT { def kb_filter = (params.kb_filter) ? '--filter' : '' publishDir = [ path: { "${params.outdir}/${params.aligner}" }, - mode: params.publish_dir_mode + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.args = "--workflow ${params.kb_workflow} ${kb_filter}" } @@ -254,7 +258,8 @@ if (params.aligner == 'cellrangermulti') { withName: CELLRANGER_MKVDJREF { publishDir = [ path: "${params.outdir}/${params.aligner}/mkvdjref", - mode: params.publish_dir_mode + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } } From 82784295364ffc436afd2cb7ac35258f59763efa Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 10:13:59 +0000 Subject: [PATCH 56/66] also convert concat h5ads --- modules/local/concat_h5ad.nf | 2 +- subworkflows/local/mtx_conversion.nf | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/modules/local/concat_h5ad.nf b/modules/local/concat_h5ad.nf index 41310553..393a7353 100644 --- a/modules/local/concat_h5ad.nf +++ b/modules/local/concat_h5ad.nf @@ -11,7 +11,7 @@ process CONCAT_H5AD { path samplesheet output: - path "*.h5ad", emit: h5ad + tuple val(meta), path("*.h5ad"), emit: h5ad when: task.ext.when == null || task.ext.when diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 55bd68dc..9891536d 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -56,13 +56,6 @@ workflow MTX_CONVERSION { } - // - // MODULE: Convert to Rds with AnndataR package - // - ANNDATAR_CONVERT ( - ch_h5ads - ) - // // Concat sample-specific h5ad in one // @@ -78,6 +71,18 @@ workflow MTX_CONVERSION { ch_concat_h5ad_input, samplesheet ) + ch_h5ad_concat = CONCAT_H5AD.out.h5ad.map{ meta, file -> + def meta_clone = meta.clone() + meta_clone.id = 'combined' // maintain output prefix + [ meta_clone, file ] + } + + // + // MODULE: Convert to Rds with AnndataR package + // + ANNDATAR_CONVERT ( + ch_h5ads.mix( ch_h5ad_concat ) + ) //TODO CONCAT h5ad and MTX to h5ad should also have versions.yaml output // ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions, MTX_TO_SEURAT.out.versions) From 2b199d053d011c785b71d45cb1d04ba8eaf2f440 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 10:23:02 +0000 Subject: [PATCH 57/66] manage subdirectory in publishDir --- conf/modules.config | 6 +++++- modules/local/anndatar_convert.nf | 2 +- modules/local/mtx_to_h5ad.nf | 4 ++-- modules/local/templates/anndatar_convert.R | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 4a230680..ed0c1b92 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -53,7 +53,11 @@ process { publishDir = [ path: { "${params.outdir}/${params.aligner}/mtx_conversions" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> + if (!filename.contains('combined_')) { "${meta.id}/${filename}" } + else if (filename.equals('versions.yml')) { null } + else filename + } ] } diff --git a/modules/local/anndatar_convert.nf b/modules/local/anndatar_convert.nf index 8fb6dac3..12a2b1b4 100644 --- a/modules/local/anndatar_convert.nf +++ b/modules/local/anndatar_convert.nf @@ -9,7 +9,7 @@ process ANNDATAR_CONVERT { tuple val(meta), path(h5ad) output: - tuple val(meta), path("${meta.id}/${meta.id}_${meta.input_type}_matrix.Rds"), emit: rds + tuple val(meta), path("${meta.id}_${meta.input_type}_matrix.Rds"), emit: rds when: task.ext.when == null || task.ext.when diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index 751cf9a5..ea56c7ae 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -13,8 +13,8 @@ process MTX_TO_H5AD { path star_index output: - tuple val(meta), path("${meta.id}/*h5ad"), emit: h5ad - path "versions.yml" , emit: versions + tuple val(meta), path("${meta.id}_${meta.input_type}_matrix.h5ad"), emit: h5ad + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/templates/anndatar_convert.R b/modules/local/templates/anndatar_convert.R index 06a723d7..5be46163 100755 --- a/modules/local/templates/anndatar_convert.R +++ b/modules/local/templates/anndatar_convert.R @@ -13,4 +13,4 @@ obj <- adata\$to_Seurat() # save files dir.create(file.path("$meta.id"), showWarnings = FALSE) -saveRDS(obj, file = "${meta.id}/${meta.id}_${meta.input_type}_matrix.Rds") +saveRDS(obj, file = "${meta.id}_${meta.input_type}_matrix.Rds") From cb677974e480ef386349efc70f7f4ac79b7bb75b Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 10:24:35 +0000 Subject: [PATCH 58/66] match template scripts with new publishDir --- modules/local/templates/mtx_to_h5ad_alevin.py | 2 +- modules/local/templates/mtx_to_h5ad_cellranger.py | 2 +- modules/local/templates/mtx_to_h5ad_kallisto.py | 4 ++-- modules/local/templates/mtx_to_h5ad_star.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/local/templates/mtx_to_h5ad_alevin.py b/modules/local/templates/mtx_to_h5ad_alevin.py index 83cbd946..d54a1667 100755 --- a/modules/local/templates/mtx_to_h5ad_alevin.py +++ b/modules/local/templates/mtx_to_h5ad_alevin.py @@ -84,7 +84,7 @@ def input_to_adata( # input_type comes from NF module input_to_adata( input_data="${meta.id}_alevin_results/af_quant/alevin/", - output="${meta.id}/${meta.id}_${meta.input_type}_matrix.h5ad", + output="${meta.id}_${meta.input_type}_matrix.h5ad", sample="${meta.id}" ) diff --git a/modules/local/templates/mtx_to_h5ad_cellranger.py b/modules/local/templates/mtx_to_h5ad_cellranger.py index 76ec4998..ecc5c077 100755 --- a/modules/local/templates/mtx_to_h5ad_cellranger.py +++ b/modules/local/templates/mtx_to_h5ad_cellranger.py @@ -91,6 +91,6 @@ def input_to_adata( # input_type comes from NF module adata = input_to_adata( input_data="${meta.input_type}_feature_bc_matrix.h5", - output="${meta.id}/${meta.id}_${meta.input_type}_matrix.h5ad", + output="${meta.id}_${meta.input_type}_matrix.h5ad", sample="${meta.id}" ) diff --git a/modules/local/templates/mtx_to_h5ad_kallisto.py b/modules/local/templates/mtx_to_h5ad_kallisto.py index 0fc606ba..37f66c5f 100755 --- a/modules/local/templates/mtx_to_h5ad_kallisto.py +++ b/modules/local/templates/mtx_to_h5ad_kallisto.py @@ -97,7 +97,7 @@ def input_to_adata( matrix=glob.glob("${inputs}/*.mtx")[0], barcodes=glob.glob("${inputs}/*.barcodes.txt")[0], features=glob.glob("${inputs}/*.genes.txt")[0], - output="${meta.id}/${meta.id}_${meta.input_type}_matrix.h5ad", + output="${meta.id}_${meta.input_type}_matrix.h5ad", sample="${meta.id}", t2g="${txp2gene}" ) @@ -108,7 +108,7 @@ def input_to_adata( matrix=glob.glob("${inputs}/" + f"{type}*.mtx")[0], barcodes=glob.glob("${inputs}/" + f"{type}*.barcodes.txt")[0], features=glob.glob("${inputs}/" + f"{type}*.genes.txt")[0], - output="${meta.id}/${meta.id}_${meta.input_type}" + f"_{type}_matrix.h5ad", + output="${meta.id}_${meta.input_type}" + f"_{type}_matrix.h5ad", sample="${meta.id}", t2g="${txp2gene}" ) diff --git a/modules/local/templates/mtx_to_h5ad_star.py b/modules/local/templates/mtx_to_h5ad_star.py index b4c71e9e..e44d2478 100755 --- a/modules/local/templates/mtx_to_h5ad_star.py +++ b/modules/local/templates/mtx_to_h5ad_star.py @@ -80,7 +80,7 @@ def input_to_adata( # input_type comes from NF module input_to_adata( input_data="${meta.input_type}", - output="${meta.id}/${meta.id}_${meta.input_type}_matrix.h5ad", + output="${meta.id}_${meta.input_type}_matrix.h5ad", sample="${meta.id}" ) From 7230095677ba87f5325ed7bd4d853b00ee4255fe Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 10:31:25 +0000 Subject: [PATCH 59/66] have outputs separated --- subworkflows/local/starsolo.nf | 14 +++++--------- workflows/scrnaseq.nf | 4 +++- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf index d69ce9d9..4cbf7aea 100644 --- a/subworkflows/local/starsolo.nf +++ b/subworkflows/local/starsolo.nf @@ -54,16 +54,12 @@ workflow STARSOLO { ) ch_versions = ch_versions.mix(STAR_ALIGN.out.versions) - // generate channel of star counts with correct metadata - ch_star_counts = - STAR_ALIGN.out.raw_counts.map{ meta, files -> [meta + [input_type: 'raw'], files] } - .mix( STAR_ALIGN.out.filtered_counts.map{ meta, files -> [meta + [input_type: 'filtered'], files] } ) - - emit: ch_versions // get rid of meta for star index - star_result = STAR_ALIGN.out.tab - star_counts = ch_star_counts - for_multiqc = STAR_ALIGN.out.log_final.map{ meta, it -> it } + star_result = STAR_ALIGN.out.tab + star_counts = STAR_ALIGN.out.counts + raw_counts = STAR_ALIGN.out.raw_counts + filtered_counts = STAR_ALIGN.out.filtered_counts + for_multiqc = STAR_ALIGN.out.log_final.map{ meta, it -> it } } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 61401185..f7bbe692 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -170,7 +170,9 @@ workflow SCRNASEQ { ) ch_versions = ch_versions.mix(STARSOLO.out.ch_versions) ch_multiqc_files = ch_multiqc_files.mix(STARSOLO.out.for_multiqc) - ch_mtx_matrices = STARSOLO.out.star_counts + ch_mtx_matrices = + STARSOLO.out.raw_counts.map{ meta, files -> [meta + [input_type: 'raw'], files] } + .mix( STARSOLO.out.filtered_counts.map{ meta, files -> [meta + [input_type: 'filtered'], files] } ) } // Run cellranger pipeline From 206a7c1d8f50914d70a873bf66dfd184219d17f5 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 10:36:48 +0000 Subject: [PATCH 60/66] make parsing inside sub-workflow --- subworkflows/local/starsolo.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf index 4cbf7aea..aadda6b6 100644 --- a/subworkflows/local/starsolo.nf +++ b/subworkflows/local/starsolo.nf @@ -59,7 +59,7 @@ workflow STARSOLO { // get rid of meta for star index star_result = STAR_ALIGN.out.tab star_counts = STAR_ALIGN.out.counts - raw_counts = STAR_ALIGN.out.raw_counts - filtered_counts = STAR_ALIGN.out.filtered_counts + raw_counts = STAR_ALIGN.out.raw_counts.map{ meta, files -> [meta + [input_type: 'raw'], files] } + filtered_counts = STAR_ALIGN.out.filtered_counts.map{ meta, files -> [meta + [input_type: 'filtered'], files] } for_multiqc = STAR_ALIGN.out.log_final.map{ meta, it -> it } } From c4a09e07baba8b2a1176e6e8429a3f133128cd55 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 10:43:28 +0000 Subject: [PATCH 61/66] added kallisto to correct structure --- subworkflows/local/kallisto_bustools.nf | 4 +++- workflows/scrnaseq.nf | 6 ++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/kallisto_bustools.nf b/subworkflows/local/kallisto_bustools.nf index d6b171a6..5f8d9bcc 100644 --- a/subworkflows/local/kallisto_bustools.nf +++ b/subworkflows/local/kallisto_bustools.nf @@ -66,7 +66,9 @@ workflow KALLISTO_BUSTOOLS { emit: ch_versions - counts = ch_raw_counts.mix (ch_filtered_counts) + counts = KALLISTOBUSTOOLS_COUNT.out.count + counts_raw = ch_raw_counts + counts_filtered = ch_filtered_counts txp2gene = txp2gene.collect() } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index f7bbe692..070565f4 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -135,7 +135,7 @@ workflow SCRNASEQ { ch_fastq ) ch_versions = ch_versions.mix(KALLISTO_BUSTOOLS.out.ch_versions) - ch_mtx_matrices = KALLISTO_BUSTOOLS.out.counts + ch_mtx_matrices = KALLISTO_BUSTOOLS.out.counts_raw.mix( KALLISTO_BUSTOOLS.out.counts_filtered ) ch_txp2gene = KALLISTO_BUSTOOLS.out.txp2gene } @@ -170,9 +170,7 @@ workflow SCRNASEQ { ) ch_versions = ch_versions.mix(STARSOLO.out.ch_versions) ch_multiqc_files = ch_multiqc_files.mix(STARSOLO.out.for_multiqc) - ch_mtx_matrices = - STARSOLO.out.raw_counts.map{ meta, files -> [meta + [input_type: 'raw'], files] } - .mix( STARSOLO.out.filtered_counts.map{ meta, files -> [meta + [input_type: 'filtered'], files] } ) + ch_mtx_matrices = STARSOLO.out.raw_counts.mix( STARSOLO.out.filtered_counts ) } // Run cellranger pipeline From 971667b454be7960ccbc57a9eb5bef7239fa6beb Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 10:47:33 +0000 Subject: [PATCH 62/66] correct mix of channels --- subworkflows/local/align_cellranger.nf | 7 ++++--- workflows/scrnaseq.nf | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/subworkflows/local/align_cellranger.nf b/subworkflows/local/align_cellranger.nf index 86b69a38..d787e0f0 100644 --- a/subworkflows/local/align_cellranger.nf +++ b/subworkflows/local/align_cellranger.nf @@ -63,7 +63,8 @@ workflow CELLRANGER_ALIGN { emit: ch_versions - cellranger_out = CELLRANGER_COUNT.out.outs - cellranger_matrices = ch_matrices_raw.mix( ch_matrices_filtered ) - star_index = cellranger_index + cellranger_out = CELLRANGER_COUNT.out.outs + cellranger_matrices_raw = ch_matrices_raw + cellranger_matrices_filtered = ch_matrices_filtered + star_index = cellranger_index } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 070565f4..20776d6b 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -135,7 +135,7 @@ workflow SCRNASEQ { ch_fastq ) ch_versions = ch_versions.mix(KALLISTO_BUSTOOLS.out.ch_versions) - ch_mtx_matrices = KALLISTO_BUSTOOLS.out.counts_raw.mix( KALLISTO_BUSTOOLS.out.counts_filtered ) + ch_mtx_matrices = ch_mtx_matrices.mix( KALLISTO_BUSTOOLS.out.counts_raw, KALLISTO_BUSTOOLS.out.counts_filtered ) ch_txp2gene = KALLISTO_BUSTOOLS.out.txp2gene } @@ -153,7 +153,7 @@ workflow SCRNASEQ { ) ch_versions = ch_versions.mix(SCRNASEQ_ALEVIN.out.ch_versions) ch_multiqc_files = ch_multiqc_files.mix(SCRNASEQ_ALEVIN.out.alevin_results.map{ meta, it -> it }) - ch_mtx_matrices = ch_mtx_matrices.mix(SCRNASEQ_ALEVIN.out.alevin_results) + ch_mtx_matrices = ch_mtx_matrices.mix( SCRNASEQ_ALEVIN.out.alevin_results ) } // Run STARSolo pipeline @@ -170,7 +170,7 @@ workflow SCRNASEQ { ) ch_versions = ch_versions.mix(STARSOLO.out.ch_versions) ch_multiqc_files = ch_multiqc_files.mix(STARSOLO.out.for_multiqc) - ch_mtx_matrices = STARSOLO.out.raw_counts.mix( STARSOLO.out.filtered_counts ) + ch_mtx_matrices = ch_mtx_matrices.mix( STARSOLO.out.raw_counts, STARSOLO.out.filtered_counts ) } // Run cellranger pipeline @@ -183,7 +183,7 @@ workflow SCRNASEQ { protocol_config['protocol'] ) ch_versions = ch_versions.mix(CELLRANGER_ALIGN.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_ALIGN.out.cellranger_matrices) + ch_mtx_matrices = ch_mtx_matrices.mix( CELLRANGER_ALIGN.out.cellranger_matrices_raw, CELLRANGER_ALIGN.out.cellranger_matrices_filtered ) ch_multiqc_files = ch_multiqc_files.mix(CELLRANGER_ALIGN.out.cellranger_out.map { meta, outs -> outs.findAll{ it -> it.name == "web_summary.html"} }) From 81221135336857206c7eaa24cd58dfc1c1b17810 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 10:49:17 +0000 Subject: [PATCH 63/66] correct for cellranger multi --- subworkflows/local/align_cellrangermulti.nf | 5 +++-- workflows/scrnaseq.nf | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/align_cellrangermulti.nf b/subworkflows/local/align_cellrangermulti.nf index 977bf478..f13c7bf1 100644 --- a/subworkflows/local/align_cellrangermulti.nf +++ b/subworkflows/local/align_cellrangermulti.nf @@ -204,8 +204,9 @@ workflow CELLRANGER_MULTI_ALIGN { emit: ch_versions - cellrangermulti_out = CELLRANGER_MULTI.out.outs - cellrangermulti_mtx = ch_matrices_raw.mix( ch_matrices_filtered ) + cellrangermulti_out = CELLRANGER_MULTI.out.outs + cellrangermulti_mtx_raw = ch_matrices_raw + cellrangermulti_mtx_filtered = ch_matrices_filtered } def parse_demultiplexed_output_channels(in_ch, pattern) { diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 20776d6b..418f6711 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -281,7 +281,7 @@ workflow SCRNASEQ { ch_multiqc_files = ch_multiqc_files.mix( CELLRANGER_MULTI_ALIGN.out.cellrangermulti_out.map{ meta, outs -> outs.findAll{ it -> it.name == "web_summary.html" } }) - ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_MULTI_ALIGN.out.cellrangermulti_mtx) + ch_mtx_matrices = ch_mtx_matrices.mix( CELLRANGER_MULTI_ALIGN.out.cellrangermulti_mtx_raw, CELLRANGER_MULTI_ALIGN.out.cellrangermulti_mtx_filtered ) } From 44d1cb745c0d2907a97b992df0f217313ad8333b Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 11:04:24 +0000 Subject: [PATCH 64/66] correct stub --- modules/local/mtx_to_h5ad.nf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index ea56c7ae..27e1070a 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -26,8 +26,7 @@ process MTX_TO_H5AD { stub: """ - mkdir ${meta.id} - touch ${meta.id}/${meta.id}_matrix.h5ad + touch ${meta.id}_raw_matrix.h5ad touch versions.yml """ } From 18cfdd7571ddb80234ff7e6ed2c58f6360605292 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 11:04:53 +0000 Subject: [PATCH 65/66] remove glob in txp2gene --- modules/local/templates/mtx_to_h5ad_kallisto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/templates/mtx_to_h5ad_kallisto.py b/modules/local/templates/mtx_to_h5ad_kallisto.py index 37f66c5f..8d0f0909 100755 --- a/modules/local/templates/mtx_to_h5ad_kallisto.py +++ b/modules/local/templates/mtx_to_h5ad_kallisto.py @@ -25,7 +25,7 @@ def _mtx_to_adata( adata.var_names = pd.read_csv(features, header=None, sep="\\t")[0].values adata.obs["sample"] = sample - txp2gene = pd.read_table(glob.glob(f"{t2g}")[0], header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2]) + txp2gene = pd.read_table(f"{t2g}", header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2]) txp2gene = txp2gene.drop_duplicates(subset="gene_id").set_index("gene_id") adata.var = adata.var.join(txp2gene, how="left") From 079bb7ec7a7dd5f4a2009463106e785fabbdd178 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 30 Oct 2024 11:15:13 +0000 Subject: [PATCH 66/66] added small comment to local modules --- modules/local/adata_barcodes.nf | 6 ++++++ modules/local/alevinqc.nf | 5 +++++ modules/local/anndatar_convert.nf | 5 +++++ modules/local/concat_h5ad.nf | 6 ++++++ modules/local/gffread_transcriptome.nf | 5 +++++ modules/local/gtf_gene_filter.nf | 5 +++++ modules/local/mtx_to_h5ad.nf | 5 +++++ modules/local/parse_cellrangermulti_samplesheet.nf | 5 +++++ modules/local/simpleaf_index.nf | 5 +++++ modules/local/simpleaf_quant.nf | 5 +++++ modules/local/star_align.nf | 5 +++++ 11 files changed, 57 insertions(+) diff --git a/modules/local/adata_barcodes.nf b/modules/local/adata_barcodes.nf index 2aef8ec9..630d90ae 100644 --- a/modules/local/adata_barcodes.nf +++ b/modules/local/adata_barcodes.nf @@ -1,4 +1,10 @@ process ADATA_BARCODES { + + // + // Module from nf-core/scdownstream. + // This module performs the subset of the h5ad file to only contain barcodes that passed emptydrops filter with cellbender + // + tag "$meta.id" label 'process_single' diff --git a/modules/local/alevinqc.nf b/modules/local/alevinqc.nf index 9000d79e..777a1371 100644 --- a/modules/local/alevinqc.nf +++ b/modules/local/alevinqc.nf @@ -1,4 +1,9 @@ process ALEVINQC { + + // + // This module executes alevinfry QC reporting tool on alevin results + // + tag "$meta.id" label 'process_low' diff --git a/modules/local/anndatar_convert.nf b/modules/local/anndatar_convert.nf index 12a2b1b4..8e4c242e 100644 --- a/modules/local/anndatar_convert.nf +++ b/modules/local/anndatar_convert.nf @@ -1,4 +1,9 @@ process ANNDATAR_CONVERT { + + // + // This module uses the anndata R package to convert h5ad files in different formats + // + tag "${meta.id}" label 'process_medium' diff --git a/modules/local/concat_h5ad.nf b/modules/local/concat_h5ad.nf index 393a7353..c875ba3c 100644 --- a/modules/local/concat_h5ad.nf +++ b/modules/local/concat_h5ad.nf @@ -1,4 +1,10 @@ process CONCAT_H5AD { + + // + // This module concatenates all h5ad, per type (raw, filtered, etc.) files generated during pipeline execution + // + + tag "${meta.id}" label 'process_medium' diff --git a/modules/local/gffread_transcriptome.nf b/modules/local/gffread_transcriptome.nf index ab573b07..671b6726 100644 --- a/modules/local/gffread_transcriptome.nf +++ b/modules/local/gffread_transcriptome.nf @@ -1,4 +1,9 @@ process GFFREAD_TRANSCRIPTOME { + + // + // This module uses gffread to filter input to generate a transcripts fasta + // + tag "${genome_fasta}" label 'process_low' diff --git a/modules/local/gtf_gene_filter.nf b/modules/local/gtf_gene_filter.nf index 063bd228..10af352b 100644 --- a/modules/local/gtf_gene_filter.nf +++ b/modules/local/gtf_gene_filter.nf @@ -1,4 +1,9 @@ process GTF_GENE_FILTER { + + // + // This module executes a custom script to filter input gtf to contain only annotations present in input genome + // + tag "$fasta" label 'process_low' diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index 27e1070a..f54318f1 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -1,4 +1,9 @@ process MTX_TO_H5AD { + + // + // This module executes different conversion template scripts (per aligner) for converting output mtx files into h5ad files + // + tag "$meta.id" label 'process_medium' diff --git a/modules/local/parse_cellrangermulti_samplesheet.nf b/modules/local/parse_cellrangermulti_samplesheet.nf index df616995..e8f56b67 100644 --- a/modules/local/parse_cellrangermulti_samplesheet.nf +++ b/modules/local/parse_cellrangermulti_samplesheet.nf @@ -1,4 +1,9 @@ process PARSE_CELLRANGERMULTI_SAMPLESHEET { + + // + // This module contains a custom script for checking special cellranger multi samplesheet + // + label 'process_low' publishDir = [ enabled: false ] diff --git a/modules/local/simpleaf_index.nf b/modules/local/simpleaf_index.nf index 8e8bd519..5c362c99 100644 --- a/modules/local/simpleaf_index.nf +++ b/modules/local/simpleaf_index.nf @@ -1,4 +1,9 @@ process SIMPLEAF_INDEX { + + // + // This module executes simpleaf to generate alevin genome index + // + tag "$transcript_gtf" label "process_medium" diff --git a/modules/local/simpleaf_quant.nf b/modules/local/simpleaf_quant.nf index abb58404..9241b210 100644 --- a/modules/local/simpleaf_quant.nf +++ b/modules/local/simpleaf_quant.nf @@ -1,4 +1,9 @@ process SIMPLEAF_QUANT { + + // + // This module executes simpleaf to perform quantification with alevin + // + tag "$meta.id" label 'process_high' diff --git a/modules/local/star_align.nf b/modules/local/star_align.nf index 4b3df1e1..70d6770c 100644 --- a/modules/local/star_align.nf +++ b/modules/local/star_align.nf @@ -1,4 +1,9 @@ process STAR_ALIGN { + + // + // This module executes STAR align quantification + // + tag "$meta.id" label 'process_high'