diff --git a/assets/schema_tools.json b/assets/schema_tools.json new file mode 100644 index 00000000..1820b94e --- /dev/null +++ b/assets/schema_tools.json @@ -0,0 +1,61 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "title": "nf-core/differentialabundance - params.tools schema", + "description": "Schema for the file provided with params.tools", + "type": "array", + "items": { + "type": "object", + "properties": { + "pathway_name": { + "type": "string", + "meta": ["pathway_name"] + }, + "diff_method": { + "type": "string", + "errorMessage": "choose propd, DESeq2 or none", + "meta": ["diff_method"] + }, + "args_diff": { + "type": "string", + "meta": ["args_diff"] + }, + "enr_diff_method": { + "type": "string", + "meta": ["enr_diff_method"], + "errorMessage": "choose grea, gsea or none" + }, + "args_enr_diff": { + "type": "string", + "meta": ["args_enr_diff"] + }, + "cor_method": { + "type": "string", + "meta": ["cor_method"], + "errorMessage": "choose correlation,proportionality, partial correlation or none" + }, + "args_cor": { + "type": "string", + "meta": ["args_cor"] + }, + "enr_cor_method": { + "type": "string", + "meta": ["enr_cor_method"], + "errorMessage": "choose grea or none" + }, + "args_enr_cor": { + "type": "string", + "meta": ["args_enr_cor"] + }, + "sel_method": { + "type": "string", + "meta": ["sel_method"], + "errorMessage": "choose filtervar or none" + }, + "args_sel": { + "type": "string", + "meta": ["args_sel"] + } + }, + "required": [] + } +} diff --git a/assets/tools_samplesheet.csv b/assets/tools_samplesheet.csv new file mode 100644 index 00000000..1c16d64b --- /dev/null +++ b/assets/tools_samplesheet.csv @@ -0,0 +1,6 @@ +pathway_name,diff_method,args_diff,enr_diff_method,args_enr_diff,cor_method,args_cor,enr_cor_method,args_enr_cor,sel_method,args_sel +diff_prop,propd, --adjacency true --permutation 100 --fixseed true, , ,,, , ,, +filtered_pcor,propd, --adjacency true --permutation 100 --fixseed true, , ,propr, --permutation 10 --adjacency true --cutoff_min 0.005 --cutoff_max 0.5 --cutoff_interval 0.01 --metric pcor.bshrink, , , filtervar, +prop,, , , ,propr, --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.05 --fixseed true --metric rho --permutation 100 --adjacency true, , , , +diff_grea,propd,--group_col fase --adjacency true --permutation 10,grea, --permutation 10, , , , ,, +deseq2,deseq2,,gsea,,,,,,, diff --git a/conf/crg.config b/conf/crg.config new file mode 100644 index 00000000..d1f6b419 --- /dev/null +++ b/conf/crg.config @@ -0,0 +1,41 @@ +params { + config_profile_name = 'CRG profile' + config_profile_description = 'Configuration to run on CRG cluster' + + max_cpus = 64 + max_memory = 100.GB + max_time = 48.h +} + + +process { + executor = 'crg' + //maxRetries = params.max_retries + //errorStrategy = params.err_start + + withLabel:process_low { + queue = 'cn-el7,short-centos79' + cpus = { check_max( 2 , 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_medium{ + queue = 'cn-el7,short-centos79' + cpus = { check_max( 6 , 'cpus' ) } + memory = { check_max( 36.GB * task.attempt, 'memory' ) } + time = { check_max( 8.h * task.attempt, 'time' ) } + } + withLabel:process_high { + queue = 'cn-el7,long-centos79' + cpus = { check_max( 12 , 'cpus' ) } + memory = { check_max( 56.GB * task.attempt, 'memory' ) } + time = { check_max( 12.h * task.attempt, 'time' ) } + + } +} + + +singularity { + enabled = true + cacheDir = 'singularity_cache' +} diff 
--git a/conf/modules_coda.config b/conf/modules_coda.config new file mode 100644 index 00000000..44d272e4 --- /dev/null +++ b/conf/modules_coda.config @@ -0,0 +1,45 @@ +process { + withName: "PROPR"{ + ext.args = { "${meta.args_cor}" == "null" ? '' : "${meta.args_cor}" } + publishDir = [ + path: { "${params.outdir}/correlation_analysis/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: "PROPD"{ + ext.args = { "${meta.args_diff}" == "null" ? '' : "${meta.args_diff}" } + publishDir = [ + path: { "${params.outdir}/differential_analysis/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: "FILTERVAR"{ + ext.args = { "${meta.args_cor}" == "null" ? '' : "${meta.args_cor}" } + publishDir = [ + path: { "${params.outdir}/variable_selection/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: "GREA_DIFF"{ + ext.args = { "${meta.args_enr_diff}" == "null" ? '' : "${meta.args_enr_diff}" } + publishDir = [ + path: { "${params.outdir}/enrichment_differential/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: "GREA_COR"{ + ext.args = { "${meta.args_enr_cor}" == "null" ? '' : "${meta.args_enr_cor}" } + publishDir = [ + path: { "${params.outdir}/enrichment_correlation/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } +} diff --git a/main_coda.nf b/main_coda.nf new file mode 100644 index 00000000..ca9043bc --- /dev/null +++ b/main_coda.nf @@ -0,0 +1,40 @@ +// include { PROPR_PROPR } from '../../../modules/nf-core/propr/propr/main' +// include { PROPR_PROPD } from '../../../modules/nf-core/propr/propd/main' +// include { PROPR_GREA } from '../../../modules/nf-core/propr/grea/main' +// include { MYGENE } from '../../../modules/nf-core/mygene/main' +include { EXPERIMENTAL } from './subworkflows/experimental/main.nf' +include { fromSamplesheet } from 'plugin/nf-validation' + + +// These are local files from my Bachelor Thesis project, I am creating the ch_samples_and_matrix +// manually for testing but it should be be provided by the processing section of nf-core/differentialabundance +Counts_ch = Channel.fromPath("../YMC/counts_sin0.csv") + +Sample_ch = Channel.fromPath("../YMC/samplesheet_RCvsOX.csv") + .map{ it -> [[id: 'YMC'], it]} + +ch_samples_and_matrix = Sample_ch.combine(Counts_ch) + +// Convert the samplesheet.csv in a channel with the proper format +ch_tools = Channel.fromSamplesheet('tools') + + +// TO DO: This should be modified to run one path per default, not all +if (params.pathway == "all") { + ch_tools + .set{ ch_tools_single } +} else { + ch_tools + .filter{ + it[0]["pathway_name"] == params.pathway // TO DO: change pathway to path also in the tools_samplesheet file + } + //.view() + .set{ ch_tools_single } +} +ch_tools_single.view() + +workflow { + EXPERIMENTAL(ch_samples_and_matrix, ch_tools_single) + EXPERIMENTAL.out.output.view() +} + diff --git a/modules.json b/modules.json index ff141997..c4aa0ba5 100644 --- a/modules.json +++ b/modules.json @@ -60,6 +60,26 @@ "git_sha": "9326d73af3fbe2ee90d9ce0a737461a727c5118e", "installed_by": ["modules"] }, + "mygene": { + "branch": "master", + "git_sha": "82024cf6325d2ee194e7f056d841ecad2f6856e9", + "installed_by": ["modules"] + }, + "propr/grea": { + "branch": "master", + "git_sha": "71b1180a5a3de6398eb0eb4d55424cbda36f52d8", + "installed_by": ["modules"] + }, + "propr/propd": { + "branch": "master", + "git_sha": "08b360512467e8e7079f995bb7c981a7c204d00f", + "installed_by": ["modules"] + }, + "propr/propr": { + "branch": "master", + "git_sha": "132fa6c9bd2515807f6a1cdec1ad7d03c817bcc9", + "installed_by": ["modules"] + }, "proteus/readproteingroups": { "branch": "master", "git_sha": "a069b29783583c219c1f23ed3dcf64a5aee1340b", diff --git a/modules/local/filtervar/main.nf b/modules/local/filtervar/main.nf new file mode 100644 index 00000000..5d9c39c8 --- /dev/null +++ b/modules/local/filtervar/main.nf @@ -0,0 +1,23 @@ +process FILTERVAR { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/r-propr:5.0.3': + 'quay.io/biocontainers/r-propr:5.0.3' }" + + input: + tuple val(meta), path(count), path(adj_matrix) + + output: + tuple val(meta), path("*.count_filtered.tsv"), emit: count + path "*.R_sessionInfo.log", emit: session_info + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'filtervar.R' +} diff --git a/modules/local/filtervar/templates/filtervar.R b/modules/local/filtervar/templates/filtervar.R new file mode 100644 index 00000000..7b662626 --- /dev/null +++ b/modules/local/filtervar/templates/filtervar.R @@ -0,0 +1,275 @@ + + +#!/usr/bin/env Rscript + + +################################################ +################################################ +## Functions ## +################################################ +################################################ + +#' Parse out options from a string without recourse to optparse +#' +#' @param x Long-form argument list like --opt1 val1 --opt2 val2 +#' +#' @return named list of options and values similar to optparse + +parse_args <- function(x){ + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} + +#' Flexibly read CSV or TSV files +#' +#' @param file Input file +#' @param header Boolean. TRUE if first row is header. False without header. +#' @param row.names The first column is used as row names by default. +#' Otherwise, give another number. Or use NULL when no row.names are present. 
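+#' @param check.names Boolean. Passed on to read.delim; when TRUE, column names are
+#'   converted to syntactically valid R names (e.g. an "X" is prepended to names that
+#'   start with a digit).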
+#' +#' @return output Data frame +read_delim_flexible <- function(file, header = TRUE, row.names = 1, check.names = TRUE){ + + ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) + + if (ext == "tsv" || ext == "txt") { + separator <- "\\t" + } else if (ext == "csv") { + separator <- "," + } else { + stop(paste("Unknown separator for", ext)) + } + + mat <- read.delim( + file, + sep = separator, + header = header, + row.names = row.names, + check.names = check.names + ) + + if ( (row.names == 'gene_id') & ('gene_name' %in% colnames(mat)) ){ + mat <- mat[, -which(colnames(mat) == 'gene_name')] + } else if ( (row.names == 'gene_name') & ('gene_id' %in% colnames(mat)) ){ + mat <- mat[, -which(colnames(mat) == 'gene_id')] + } + + return(mat) +} + +read_delim_flexible2 <- function(file, header = TRUE){ + + ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) + + if (ext == "tsv" || ext == "txt") { + separator <- "\\t" + } else if (ext == "csv") { + separator <- "," + } else { + stop(paste("Unknown separator for", ext)) + } + + mat <- read.delim( + file, + sep = separator, + header = header + ) + return(mat) +} + + + +################################################ +################################################ +## Parse arguments ## +################################################ +################################################ + +opt <- list( + count = '$count', + prefix = ifelse('$task.ext.prefix' == 'null', '$meta.pathway_name', '$task.ext.prefix'), + transformation = 'clr', + reference = NA, + alpha = NA, + metric = 'pcor.bshrink', + permutation = 0, + cutoff_min = NA, + cutoff_max = NA, + cutoff_interval = NA, + ncores = as.integer('$task.cpus'), + features_id_col = 'gene_id', + fixseed = FALSE, + adjacency = FALSE, + fdrVal = 0.05, + adj_matrix = '$adj_matrix', + filterVar = 'yes' +) +opt_types <- list( + count = 'character', + prefix = 'character', + transformation = 'character', + reference = 'character', + alpha = 'numeric', + metric = 'character', + permutation = 'numeric', + cutoff_min = 'numeric', + cutoff_max = 'numeric', + cutoff_interval = 'numeric', + ncores = 'numeric', + features_id_col = 'character', + fixseed = 'logical', + adjacency = 'logical', + fdrVal = 'numeric', + adj_matrix = 'character', + filterVar = 'character' +) + + +# Apply parameter overrides +args_opt <- parse_args('$task.ext.args') + +for ( ao in names(args_opt)){ + if (! ao %in% names(opt)){ + stop(paste("Invalid option:", ao)) + } else { + + # Preserve classes from defaults where possible + if (! is.null(opt[[ao]])){ + args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) + } + # set NA + if (args_opt[[ao]] %in% c('NA', NA, 'null')){ + args_opt[[ao]] <- NA + } + opt[[ao]] <- args_opt[[ao]] + } +} + +# Check if required parameters have been provided +required_opts <- c('count') +missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! 
required_opts %in% names(opt)] +if (length(missing) > 0){ + stop(paste("Missing required options:", paste(missing, collapse=', '))) +} + +################################################ +################################################ +## Perform variable selection ## +################################################ +################################################ + +# read matrix +A <- read_delim_flexible( + opt\$adj_matrix, + header = TRUE, + row.names = 1, + check.names = TRUE +) + +count <- read_delim_flexible2( + opt\$count, + header = TRUE +) + +### Determine most differentially proportional genes + +# Set diagonal in A to 0 +diag(A) <- 0 + +# Sum values in adjacency and add as an extra column +per_gene_connection <- rowSums(A) + +A\$per_gene <- per_gene_connection + +A <- A[order(A\$per_gene, decreasing = TRUE),] + +# Define selection criteria + +max_gene_number <- ncol(count)*10 # 10x samples for technical reasons (pcor) + +#Calculate connection threshold +total_connections <- sum(per_gene_connection)/2 # 2 because the matrix is symmetric +possible_connections <- nrow(count)*(nrow(count)-1)/2 + +percentage_expected <- total_connections/possible_connections +connection_threshold <- percentage_expected * nrow(count) + +# Filter count matrix according to selected genes + +col_genes <- which(names(count) == opt\$features_id_col) + +if (opt\$filterVar == 'yes'){ + # select only differentially proportional genes + top_genes <- rownames(A[which(A\$per_gene > connection_threshold),]) + count_filtered <- count[count[,col_genes] %in% top_genes,] + warning("non differentially proportional genes were removed before correlation analysis") + +} else if (max_gene_number < nrow(count) & opt\$metric== 'pcor.bshrink'){ + # select the maximum number of genes to perform partial correlation + top_genes <- rownames(A[1:gene_number,]) + count_filtered <- count[count[,col_genes] %in% top_genes,] + warning("some genes were removed to perform partial correlation") + +}else{ + # no genes were removed + count_filtered <- count + warning("No genes were removed") +} + + +################################################ +################################################ +## Generate outputs ## +################################################ +################################################ + +write.table( + count_filtered, + file = paste0(opt\$prefix, '.count_filtered.tsv'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE +) + +################################################ +################################################ +## WARNINGS ## +################################################ +################################################ + +sink(paste0(opt\$prefix, ".warnings.log")) +print(warnings()) +sink() + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +sink(paste0(opt\$prefix, ".R_sessionInfo.log")) +print(sessionInfo()) +sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +propr.version <- as.character(packageVersion('propr')) + +writeLines( + c( + '"${task.process}":', + paste(' r-propr:', propr.version) + ), +'versions.yml') + diff --git a/modules/nf-core/mygene/environment.yml b/modules/nf-core/mygene/environment.yml new 
file mode 100644 index 00000000..45442c49 --- /dev/null +++ b/modules/nf-core/mygene/environment.yml @@ -0,0 +1,7 @@ +name: mygene +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::mygene=3.2.2 diff --git a/modules/nf-core/mygene/main.nf b/modules/nf-core/mygene/main.nf new file mode 100644 index 00000000..25a21d8f --- /dev/null +++ b/modules/nf-core/mygene/main.nf @@ -0,0 +1,23 @@ +process MYGENE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mygene:3.2.2--pyh5e36f6f_0': + 'biocontainers/mygene:3.2.2--pyh5e36f6f_0' }" + + input: + tuple val(meta), path(gene_list) + + output: + tuple val(meta), path("*.gmt"), emit: gmt + tuple val(meta), path("*.tsv"), emit: tsv , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template "mygene.py" +} diff --git a/modules/nf-core/mygene/meta.yml b/modules/nf-core/mygene/meta.yml new file mode 100644 index 00000000..f7aaa455 --- /dev/null +++ b/modules/nf-core/mygene/meta.yml @@ -0,0 +1,54 @@ +name: "mygene" +description: Fetch the GO concepts for a list of genes +keywords: + - mygene + - go + - annotation +tools: + - "mygene": + description: "A python wrapper to query/retrieve gene annotation data from Mygene.info." + homepage: "https://mygene.info/" + documentation: "https://docs.mygene.info/projects/mygene-py/en/latest/" + tool_dev_url: "https://github.com/biothings/mygene.py" + doi: "10.1093/nar/gks1114" + licence: ["Apache-2.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - gene_list: + type: file + description: A tsv/csv file that contains a list of gene ids in one of the columns. + By default, the column name should be "gene_id", but this can be changed + by using "--columname gene_id" in ext.args. + pattern: "*.{csv,tsv}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - gmt: + type: file + description: | + Each row contains the GO id, a description, and a list of gene ids. + pattern: "*.gmt" + - tsv: + type: file + description: | + (optional) A tsv file with the following columns: + query, mygene_id, go_id, go_term, go_evidence, go_category, symbol, name, taxid + pattern: "*.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@suzannejin" +maintainers: + - "@suzannejin" diff --git a/modules/nf-core/mygene/templates/mygene.py b/modules/nf-core/mygene/templates/mygene.py new file mode 100644 index 00000000..2717c467 --- /dev/null +++ b/modules/nf-core/mygene/templates/mygene.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +import argparse +import mygene +import shlex + + +""" +This python script uses the mygene module to query the MyGene.info API and +retrieve the go terms associated with a list of gene ids. The gene ids should +ideally be Ensembl or Entrez ids. The script generates two outputs: + 1. A tsv file containing information related to each query. The columns + include query, mygene_id, go_id, go_term, go_evidence, go_category, + symbol, name, and taxid. + 2. A gmt file containing information related to each go term. Each row + includes the go id, go term, and the genes associated with that go term. 
+ +Author: Suzanne Jin +License: Apache 2.0 (same as the mygene library) +""" + + +class Arguments: + """ + Parses the argments, including the ones coming from $task.ext.args. + """ + + def __init__(self) -> None: + self.input = "$gene_list" + self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id" + self.output_gmt = self.prefix + ".gmt" + self.output_tsv = self.prefix + ".tsv" + self.parse_ext_args("$task.ext.args") + + def parse_ext_args(self, args_string: str) -> None: + """ + It parses the extended arguments. + """ + # skip when there are no extended arguments + if args_string == "null": + args_string = "" + + # Parse the extended arguments + args_list = shlex.split(args_string) # Split the string into a list of arguments + parser = argparse.ArgumentParser() + # input parameters + parser.add_argument( + "--columname", + default="gene_id", + help="Name of the column where the gene ids are stored in the input file. Default: gene_id", + ) + # filtering parameters + parser.add_argument( + "--species", + default=None, + help="Comma separated of common name of the species or taxon ids", + ) + parser.add_argument( + "--go_category", + default=None, + help="Comma separated list of GO categories to keep. Default: all", + ) + parser.add_argument( + "--go_evidence", + default=None, + help="Comma separated list of GO evidence codes to keep. Default: all", + ) + # additional parameters for querymany + parser.add_argument( + "--scopes", + default=None, + help="Comma separated list of scopes to search for.", + ) + parser.add_argument( + "--entrezonly", + default=False, + help="When true, the query returns only the hits with valid Entrez gene ids. Default: false.", + ) + parser.add_argument( + "--ensemblonly", + default=False, + help="When true, the query returns only the hits with valid Ensembl gene ids. Default: False", + ) + # output parameters + parser.add_argument( + "--generate_tsv", + default=False, + help="Also generate a tsv file with the gene based information. Default: False", + ) + args = parser.parse_args(args_list) + + # Convert "null" values to default values + # convert true to True and false to False + for attr in vars(args): + value = getattr(args, attr) + if value == "null": + setattr(args, attr, parser.get_default(attr)) + elif value == "true": + setattr(args, attr, True) + elif value == "false": + setattr(args, attr, False) + + # check if the arguments are valid + if args.go_category: + args.go_category = args.go_category.upper() + for category in args.go_category.split(","): + if category not in ["BP", "MF", "CC"]: + raise ValueError("The GO category should be one of BP, MF, or CC.") + if args.go_evidence: + args.go_evidence = args.go_evidence.upper() + + # Assign args attributes to self attributes + for attr in vars(args): + setattr(self, attr, getattr(args, attr)) + + def print_args(self) -> None: + """ + Print the arguments. + """ + for attr in vars(self): + print(f"{attr}: {getattr(self, attr)}") + + +class Version: + """ + Parse the versions of the modules used in the script. + """ + + @staticmethod + def get_versions(modules: list) -> dict: + """ + This function takes a list of modules and returns a dictionary with the + versions of each module. + """ + return {module.__name__: module.__version__ for module in modules} + + @staticmethod + def format_yaml_like(data: dict, indent: int = 0) -> str: + """ + Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. 
+ + Returns: + yaml_str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{Version.format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + + +class MyGene: + """ + This class will query the MyGene.info API and retrieve the go terms + associated with a list of gene ids. + + In concrete, if first queries the mygene API to get the mygene ids for each + of the query gene. Then, it queries for the annotations, and parses the go + terms all together with all the other information. + """ + + def __init__( + self, + query: list, + species: str, + scopes: str, + entrezonly: bool, + ensemblonly: bool, + go_category: str = None, + go_evidence: str = None, + ) -> None: + self.query = query + self.fields = "go,symbol,name,taxid" + self.species = species + self.scopes = scopes + self.entrezonly = entrezonly + self.ensemblonly = ensemblonly + self.go_category = go_category + self.go_evidence = go_evidence + self.mg = mygene.MyGeneInfo() + self.idmap = self.query2idmap() + print(f"fetched {len(self.idmap)} ids from {len(self.query)} queries") + + def query2idmap(self) -> dict: + """ + It returns a dictionary with the mygene ids as keys and the query ids as values. + """ + q = self.mg.querymany( + self.query, + scopes=self.scopes, + species=self.species, + entrezonly=self.entrezonly, + ensemblonly=self.ensemblonly, + returnall=True, + ) + return {dic["_id"]: dic["query"] for dic in q["out"] if "_id" in dic} + + def id2info(self) -> list: + """ + It returns a list of dictionaries with the info returned from getgenes for all the query ids. + Each dictionary contains the annotations for the corresponding query gene. + """ + return self.mg.getgenes(list(set(self.idmap)), fields=self.fields, species=self.species) + + def parse_go_based_info(self) -> dict: + """ + It queries the annotations for all query ids and then parses a go + centric dictionary. It is a dictionary of dictionaries with the + following format: {{go_id1: [go_term, gene1, gene2, ...]}, ...} + """ + info = {} + for dic in self.id2info(): + if "go" not in dic: + continue + if self.go_category: + dic["go"] = { + category: dic["go"][category] for category in self.go_category.split(",") if category in dic["go"] + } + for category, go_list in dic["go"].items(): + if not isinstance(go_list, list): + go_list = [go_list] + for go in go_list: + if (self.go_evidence) and (go["evidence"] not in self.go_evidence.split(",")): + continue + + if go["id"] not in info: + info[go["id"]] = [go["term"], self.idmap[dic["_id"]]] + else: + info[go["id"]].append(self.idmap[dic["_id"]]) + return info + + def parse_gene_based_info(self) -> dict: + """ + It queries the annotations for all query ids and then parses a go + centric dictionary. + + At the end it returns a dictionary {query gene: {}} of dictionaries + with the following keys: query, mygene_id, go_id, go_term, go_evidence, + go_category, symbol, name, taxid. 
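+
+        A hypothetical entry, for illustration only:
+        {"ENSG00000123374": {"query": "ENSG00000123374", "mygene_id": "1017",
+        "go_id": "GO:0000082", "go_term": "G1/S transition of mitotic cell cycle",
+        "go_evidence": "IDA", "go_category": "BP", "symbol": "CDK2",
+        "name": "cyclin dependent kinase 2", "taxid": 9606}}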
+ """ + info = {} + for dic in self.id2info(): + if "go" not in dic: + continue + if self.go_category: + dic["go"] = { + category: dic["go"][category] for category in self.go_category.split(",") if category in dic["go"] + } + for category, go_list in dic["go"].items(): + if not isinstance(go_list, list): + go_list = [go_list] + for go in go_list: + if (self.go_evidence) and (go["evidence"] not in self.go_evidence.split(",")): + continue + + current_info = { + "query": self.idmap[dic["_id"]], + "mygene_id": dic["_id"], + "go_id": go["id"], + "go_term": go["term"], + "go_evidence": go["evidence"], + "go_category": category, + "symbol": dic["symbol"], + "name": dic["name"], + "taxid": dic["taxid"], + } + info[self.idmap[dic["_id"]]] = current_info + return info + + def parse_and_save_to_gmt(self, filename: str) -> list: + """ + It parses and saves go centric information to a gmt file. + The final gmt output will be sorted following the go id order. + """ + info = self.parse_go_based_info() + info = dict(sorted(info.items(), key=lambda x: x[0])) + with open(filename, "w") as f: + for go_id, go_list in info.items(): + tmp = sorted(go_list[1:]) + f.write(go_id + "\\t" + go_list[0] + "\\t" + "\\t".join(tmp) + "\\n") + print(f"saved {len(info)} go terms to {filename}") + + def parse_and_save_to_tsv(self, filename: str) -> None: + """ + It parses and saves gene centric information in a tsv file. + The final tsv output will be sorted following the input query gene list order. + """ + info = self.parse_gene_based_info() + with open(filename, "w") as f: + f.write("\\t".join(info[self.query[0]].keys()) + "\\n") + for gene in self.query: # sorted by query gene list + if gene in info: + f.write("\\t".join([str(val) for val in info[gene].values()]) + "\\n") + print(f"saved {len(info)} gene centric info to {filename}") + + +def load_list(filename: str, columname: str) -> list: + """ + It loads the list of gene ids from a file. + The columname is the name of the column where the gene ids are stored. 
+ """ + if filename.split(".")[-1] == "tsv": + sep = "\\t" + elif filename.split(".")[-1] == "csv": + sep = "," + else: + raise ValueError("The input file extension should be either tsv or csv.") + with open(filename, "r") as f: + idx = f.readline().strip().split(sep).index(columname) + return [line.strip().split(sep)[idx] for line in f] + + +if __name__ == "__main__": + # parse and print arguments + args = Arguments() + args.print_args() + + # load gene list + gene_list = load_list(args.input, args.columname) + + # run mygene api + mg = MyGene( + gene_list, + species=args.species, + scopes=args.scopes, + entrezonly=args.entrezonly, + ensemblonly=args.ensemblonly, + go_category=args.go_category, + go_evidence=args.go_evidence, + ) + + # parse annotations and save output files + mg.parse_and_save_to_gmt(args.output_gmt) + if args.generate_tsv: + mg.parse_and_save_to_tsv(args.output_tsv) + + # write versions to file + versions_this_module = {} + versions_this_module["${task.process}"] = Version.get_versions([argparse, mygene]) + with open("versions.yml", "w") as f: + f.write(Version.format_yaml_like(versions_this_module)) diff --git a/modules/nf-core/mygene/tests/default_tsv.config b/modules/nf-core/mygene/tests/default_tsv.config new file mode 100644 index 00000000..08bd6fac --- /dev/null +++ b/modules/nf-core/mygene/tests/default_tsv.config @@ -0,0 +1,3 @@ +process{ + ext.args = "--generate_tsv true" +} \ No newline at end of file diff --git a/modules/nf-core/mygene/tests/go_category.config b/modules/nf-core/mygene/tests/go_category.config new file mode 100644 index 00000000..771f8f4d --- /dev/null +++ b/modules/nf-core/mygene/tests/go_category.config @@ -0,0 +1,3 @@ +process { + ext.args = "--go_category bp,mf" +} \ No newline at end of file diff --git a/modules/nf-core/mygene/tests/go_evidence.config b/modules/nf-core/mygene/tests/go_evidence.config new file mode 100644 index 00000000..b19de214 --- /dev/null +++ b/modules/nf-core/mygene/tests/go_evidence.config @@ -0,0 +1,3 @@ +process { + ext.args = "--go_evidence EXP,IDA" +} \ No newline at end of file diff --git a/modules/nf-core/mygene/tests/main.nf.test b/modules/nf-core/mygene/tests/main.nf.test new file mode 100644 index 00000000..e5ba64ca --- /dev/null +++ b/modules/nf-core/mygene/tests/main.nf.test @@ -0,0 +1,106 @@ +nextflow_process { + + name "Test Process MYGENE" + script "../main.nf" + process "MYGENE" + + tag "modules" + tag "modules_nfcore" + tag "mygene" + + test("mygene - default options") { + + tag "default" + + when { + process { + """ + input[0] = [ + [id : 'test'], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.gene_meta.tsv") + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.gmt).match("mygene - default options - gmt") }, + { assert snapshot(process.out.versions).match("mygene - default options - versions") } + ) + } + } + + test("mygene - default with tsv file") { + + tag "default_with_tsv" + config "./default_tsv.config" + + when { + process { + """ + input[0] = [ + [id : 'test'], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.gene_meta.tsv") + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.gmt).match("mygene - default with tsv file - gmt") }, + { assert snapshot(process.out.tsv).match("mygene - default with tsv file - tsv") }, + { assert 
snapshot(process.out.versions).match("mygene - default with tsv file - versions") } + ) + } + } + + test("mygene - filter by go category") { + + tag "filter_by_go_category" + config "./go_category.config" + + when { + process { + """ + input[0] = [ + [id : 'test'], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.gene_meta.tsv") + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.gmt).match("mygene - filter by go category - gmt") }, + { assert snapshot(process.out.versions).match("mygene - filter by go category - versions") } + ) + } + } + + test("mygene - filter by go evidence") { + + tag "filter_by_go_evidence" + config "./go_evidence.config" + + when { + process { + """ + input[0] = [ + [id : 'test'], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.gene_meta.tsv") + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.gmt).match("mygene - filter by go evidence - gmt") }, + { assert snapshot(process.out.versions).match("mygene - filter by go evidence - versions") } + ) + } + } +} diff --git a/modules/nf-core/mygene/tests/main.nf.test.snap b/modules/nf-core/mygene/tests/main.nf.test.snap new file mode 100644 index 00000000..d6a334c7 --- /dev/null +++ b/modules/nf-core/mygene/tests/main.nf.test.snap @@ -0,0 +1,135 @@ +{ + "mygene - filter by go evidence - versions": { + "content": [ + [ + "versions.yml:md5,09d72645c3ae7e886af6e8bd2876c72b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-20T17:20:31.854823" + }, + "mygene - default options - versions": { + "content": [ + [ + "versions.yml:md5,09d72645c3ae7e886af6e8bd2876c72b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-20T17:19:43.081388" + }, + "mygene - default with tsv file - versions": { + "content": [ + [ + "versions.yml:md5,09d72645c3ae7e886af6e8bd2876c72b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-20T17:20:01.837699" + }, + "mygene - default options - gmt": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gmt:md5,d76d4d06dad199c5e3ecef7060876834" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-20T17:19:43.060437" + }, + "mygene - filter by go category - versions": { + "content": [ + [ + "versions.yml:md5,09d72645c3ae7e886af6e8bd2876c72b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-20T17:20:17.233994" + }, + "mygene - filter by go evidence - gmt": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gmt:md5,da6b31a5f889e3aedb16b4154f9652af" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-20T17:20:31.827798" + }, + "mygene - default with tsv file - tsv": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.tsv:md5,018e23173b224cbf328751006593900e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-20T17:20:01.81872" + }, + "mygene - default with tsv file - gmt": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gmt:md5,d76d4d06dad199c5e3ecef7060876834" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-20T17:20:01.79811" + }, + "mygene - filter by go category - 
gmt": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gmt:md5,213c1d1d2345df8ea51d67cb1670f4f7" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-20T17:20:17.208509" + } +} \ No newline at end of file diff --git a/modules/nf-core/mygene/tests/tags.yml b/modules/nf-core/mygene/tests/tags.yml new file mode 100644 index 00000000..c867c978 --- /dev/null +++ b/modules/nf-core/mygene/tests/tags.yml @@ -0,0 +1,2 @@ +mygene: + - "modules/nf-core/mygene/**" diff --git a/modules/nf-core/propr/grea/environment.yml b/modules/nf-core/propr/grea/environment.yml new file mode 100644 index 00000000..c6897c73 --- /dev/null +++ b/modules/nf-core/propr/grea/environment.yml @@ -0,0 +1,7 @@ +name: propr_grea +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::r-propr=5.0.4 diff --git a/modules/nf-core/propr/grea/main.nf b/modules/nf-core/propr/grea/main.nf new file mode 100644 index 00000000..d2e1ee6d --- /dev/null +++ b/modules/nf-core/propr/grea/main.nf @@ -0,0 +1,24 @@ +process PROPR_GREA { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/r-propr:5.0.4': + 'biocontainers/r-propr:5.0.4' }" + + input: + tuple val(meta), path(adj) + tuple val(meta2), path(gmt) + + output: + tuple val(meta), path("*.go.tsv"), emit: enrichedGO + path "versions.yml", emit: versions + path "*.R_sessionInfo.log", emit: session_info + + when: + task.ext.when == null || task.ext.when + + script: + template 'grea.R' +} diff --git a/modules/nf-core/propr/grea/meta.yml b/modules/nf-core/propr/grea/meta.yml new file mode 100644 index 00000000..de40abd3 --- /dev/null +++ b/modules/nf-core/propr/grea/meta.yml @@ -0,0 +1,57 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "propr_grea" +description: Perform Gene Ratio Enrichment Analysis +keywords: + - logratio + - differential + - propr + - grea + - enrichment + - expression +tools: + - "grea": + description: "Gene Ratio Enrichment Analysis" + homepage: "https://github.com/tpq/propr" + documentation: "https://rdrr.io/cran/propr/man/propr.html" + tool_dev_url: "https://github.com/tpq/propr" + doi: "10.2202/1544-6115.1175" + licence: ["GPL-2"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information. + This can be used at the workflow level to pass optional parameters to the module. + [id: 'test', ...] + - meta2: + type: map + description: | + Groovy map containing study-wide metadata related to the knowledge database + - adj: + type: file + description: adjacency matrix for gene ratio proportionality/differential proportionality + pattern: "*.{csv,tsv}" + - gmt: + type: file + description: relational database containing genes and GO terms (generated by mygene module) + pattern: "*.{gmt}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information. + This can be used at the workflow level to pass optional parameters to the module. + [id: 'test', ...] 
+ - enrichedGO: + type: file + description: File containing GO terms and their enrichment values + pattern: "*.{csv}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@caraiz2001" +maintainers: + - "@caraiz2001" diff --git a/modules/nf-core/propr/grea/templates/grea.R b/modules/nf-core/propr/grea/templates/grea.R new file mode 100644 index 00000000..2d568b70 --- /dev/null +++ b/modules/nf-core/propr/grea/templates/grea.R @@ -0,0 +1,253 @@ +#!/usr/bin/env Rscript + +################################################ +################################################ +## Functions ## +################################################ +################################################ + +#' Parse out options from a string without recourse to optparse +#' +#' @param x Long-form argument list like --opt1 val1 --opt2 val2 +#' +#' @return named list of options and values similar to optparse +parse_args <- function(x){ + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} + +#' Flexibly read CSV or TSV files (determined by file extension) +#' +#' @param file Input file +#' @param header Boolean. TRUE if first row is header. False without header. +#' @param row.names The first column is used as row names by default. +#' Otherwise, give another number. Or use NULL when no row.names are present. +#' +#' @return output Data frame +read_delim_flexible <- function(file, header = TRUE, row.names = 1, check.names = TRUE){ + + ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) # Get the file extension + + if (ext == "tsv" || ext == "txt") { # If the file is a tsv or txt file + separator <- "\\t" # Set the separator variable to tab + } else if (ext == "csv") { # If the file is a csv file + separator <- "," + } else { + stop(paste("Unknown separator for", ext)) + } + + mat <- read.delim( # Read the file + file, + sep = separator, # Set the separator defined above + header = header, + row.names = row.names, + check.names = check.names + ) +} + +#' Converts the .gmt file into a df +#' +#' @param file_gmt_path path of the .gmt file provided by mygene module. +#' @return output dataframe a Dataframe: 1st column = GOterm, 2nd = Description, 3d to end = genes. 
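+#' For example, a .gmt line of the form
+#'   GO:0006412<TAB>translation<TAB>geneA<TAB>geneB
+#' produces one data frame row per gene id, each row repeating GOterm = "GO:0006412"
+#' and Description = "translation" (gene ids here are placeholders).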
+process_gmt_file <- function(file_gmt_path) { + + lines <- readLines(file_gmt_path) + data_list <- list() + + for (line in lines) { + fields <- strsplit(line, "\\t")[[1]] # Split the line based on the tab character + go_term <- fields[1] # Extract the GO term + + # Create a data frame with the GO term in the first column + # Fill in missing values with NA to ensure consistent column lengths + data_list[[go_term]] <- data.frame(GOterm = go_term, + Description = fields[2], + GeneIDs = c(fields[3:length(fields)], rep(NA, max(0, 3 - length(fields))))) + } + + gmt_df <- do.call(rbind, data_list) # Combine all data frames into a single data frame + gmt_df\$GeneIDs <- as.character(gmt_df\$GeneIDs) # Convert gene IDs to character to avoid coercion + + return(gmt_df) +} + +#' Converts the .gmt data frame into a knowledge matrix (contingency table) +#' +#' @param gmt_df .gmt df created by process_gmt_file +#' @return output dataframe. A knowledge database where each row is a graph node (gene) +#' and each column is a concept (GO term). +gmt_to_K<- function(gmt_df){ + + summ_df <- as.data.frame(gmt_df\$GeneIDs) + summ_df <- cbind(summ_df, as.data.frame(gmt_df\$GOterm)) + colnames(summ_df)<- c("GeneIDs", "GOterm") + summ_df<- unique(summ_df) + + summ_df\$value <- 1 + + K <- table(summ_df\$GeneIDs, summ_df\$GOterm) + K <- as.data.frame.matrix(K) + + return(K) +} + +#' Expands knowledge matrix with missing genes to ensure same number of rows for A and K +#' +#' @param adjacency_matrix gene x gene correlation or proportionality adjacency matrix (output propr/propd) +#' @return output dataframe. A knowledge database where each row is a graph node (gene) +#' and each column is a concept (GO term). +add_missing <- function(adjacency_matrix, knowledge_matrix){ + + missing_genes <- setdiff(rownames(adjacency_matrix), rownames(knowledge_matrix)) + extra_rows <- data.frame(matrix(0, nrow = length(missing_genes), ncol = ncol(knowledge_matrix))) + rownames(extra_rows) <- missing_genes + colnames(extra_rows) <- colnames(knowledge_matrix) + + knowledge_matrix <- rbind(knowledge_matrix, extra_rows) + return(knowledge_matrix) +} + +################################################ +################################################ +## Parse arguments ## +################################################ +################################################ + +opt <- list( + adj = '$adj', + gmt = '$gmt', + prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix'), + permutation = 100, + fixseed = TRUE, + ncores = as.integer('$task.cpus') +) + +opt_types <- list( + adj = 'character', + gmt = 'character', + prefix = 'character', + permutation = 'numeric', + fixseed = 'logical', + ncores = 'numeric' +) + +# Apply parameter overrides +args_opt <- parse_args('$task.ext.args') + +for ( ao in names(args_opt)){ + if (! ao %in% names(opt)){ + stop(paste("Invalid option:", ao)) + } else { + + # Preserve classes from defaults where possible + if (! is.null(opt[[ao]])){ + args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) + } + # set NA + if (args_opt[[ao]] %in% c('NA', NA, 'null')){ + args_opt[[ao]] <- NA + } + opt[[ao]] <- args_opt[[ao]] + } +} + +# Check if required parameters have been provided +required_opts <- c('adj', 'gmt') # defines a vector required_opts containing the names of the required parameters. +missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! 
required_opts %in% names(opt)] +if (length(missing) > 0){ + stop(paste("Missing required options:", paste(missing, collapse=', '))) +} + + +# Check file inputs are valid +for (file_input in c('adj', 'gmt')){ + if (is.null(opt[[file_input]])) { + stop(paste("Please provide", file_input), call. = FALSE) + } + if (! file.exists(opt[[file_input]])){ + stop(paste0('Value of ', file_input, ': ', opt[[file_input]], ' is not a valid file')) + } +} + +################################################ +################################################ +## Finish loading libraries ## +################################################ +################################################ + +library(propr) + +################################################ +################################################ +## Enrichment analysis ## +################################################ +################################################ + +# Read gene x gene adjacency matrix +A <- read_delim_flexible(opt\$adj, header = TRUE, row.names = 1, check.names = TRUE) + +# Read and process gene x GO term matrix +gmt_df <- process_gmt_file(opt\$gmt) +K <- gmt_to_K(gmt_df) + +# Ensure same number of rows (genes) +if (nrow(A) != nrow(K)){ + K <- add_missing(A, K) +} + +# Run Graflex +G <- runGraflex(A, K, opt\$permutation, opt\$fixseed) + +################################################ +################################################ +## Generate outputs ## +################################################ +################################################ + +write.table( + G, + file = paste0(opt\$prefix, '.go.tsv'), + col.names = TRUE, + row.names = TRUE, + sep = '\\t', + quote = FALSE + +) + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +sink(paste0(opt\$prefix, ".R_sessionInfo.log")) +print(sessionInfo()) +sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] +propr.version <- as.character(packageVersion('propr')) + +writeLines( + c( + '"${task.process}":', + paste(' r-base:', r.version), + paste(' r-propr:', propr.version) + ), +'versions.yml') + +################################################ +################################################ +################################################ +################################################ diff --git a/modules/nf-core/propr/grea/tests/grea_test.config b/modules/nf-core/propr/grea/tests/grea_test.config new file mode 100644 index 00000000..8d0d229a --- /dev/null +++ b/modules/nf-core/propr/grea/tests/grea_test.config @@ -0,0 +1,8 @@ +process { + withName: "PROPR_PROPR"{ + ext.args = { "--adjacency true --permutation 5 --fixseed true --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.05"} + } + withName: "PROPR_GREA"{ + ext.args = { "--permutation 5 --fixseed true"} + } +} \ No newline at end of file diff --git a/modules/nf-core/propr/grea/tests/main.nf.test b/modules/nf-core/propr/grea/tests/main.nf.test new file mode 100644 index 00000000..afac1dec --- /dev/null +++ b/modules/nf-core/propr/grea/tests/main.nf.test @@ -0,0 +1,62 @@ +nextflow_process { + + name "Test Process PROPR_GREA" + script "../main.nf" + process "PROPR_GREA" + + tag "modules" + 
tag "modules_nfcore" + tag "propr" + tag "propr/grea" + tag "mygene" + tag "propr/propr" + + test("grea chained to propr using default options") { + + tag "default" + config "./grea_test.config" + + setup { + run("PROPR_PROPR") { + script "../../propr/main.nf" + process { + """ + input[0] = [ + [ id:'test' ], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.salmon.merged.gene_counts.top1000cov.tsv") + ] + """ + } + } + run("MYGENE") { + script "../../../mygene/main.nf" + process { + """ + input[0] = [ + [id : 'test'], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.gene_meta.tsv") + ] + """ + } + } + } + + when { + process { + """ + input[0] = PROPR_PROPR.out.adj.collect{ meta, adj -> adj }.map{ adj -> [[ id: 'test_adj'], adj]} + input[1] = MYGENE.out.gmt.collect{ meta, gmt -> gmt }.map{ gmt -> [[ id: 'test_gmt'], gmt]} + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.enrichedGO).match("grea chained to propr using default options - enrichedGO") }, + { assert snapshot(process.out.versions).match("versions") } + + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/propr/grea/tests/main.nf.test.snap b/modules/nf-core/propr/grea/tests/main.nf.test.snap new file mode 100644 index 00000000..a915603d --- /dev/null +++ b/modules/nf-core/propr/grea/tests/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,222a7a8b79b5a2987637279847c609d1" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-29T10:45:07.582509" + }, + "grea chained to propr using default options - enrichedGO": { + "content": [ + [ + [ + { + "id": "test_adj" + }, + "test_adj.go.tsv:md5,914d8b750ba303a297efb7331ec238b7" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-29T10:45:07.508934" + } +} \ No newline at end of file diff --git a/modules/nf-core/propr/grea/tests/tags.yml b/modules/nf-core/propr/grea/tests/tags.yml new file mode 100644 index 00000000..e7f80baf --- /dev/null +++ b/modules/nf-core/propr/grea/tests/tags.yml @@ -0,0 +1,2 @@ +propr/grea: + - "modules/nf-core/propr/grea/**" diff --git a/modules/nf-core/propr/propd/environment.yml b/modules/nf-core/propr/propd/environment.yml new file mode 100644 index 00000000..058f30a0 --- /dev/null +++ b/modules/nf-core/propr/propd/environment.yml @@ -0,0 +1,7 @@ +name: propr_propd +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::r-propr=5.0.3 diff --git a/modules/nf-core/propr/propd/main.nf b/modules/nf-core/propr/propd/main.nf new file mode 100644 index 00000000..ba7727d0 --- /dev/null +++ b/modules/nf-core/propr/propd/main.nf @@ -0,0 +1,28 @@ +process PROPR_PROPD { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/r-propr:5.0.3': + 'biocontainers/r-propr:5.0.3' }" + + input: + tuple val(meta), path(count) + tuple val(meta2), path(samplesheet) + + output: + tuple val(meta), path("*.propd.rds"), emit: propd + tuple val(meta), path("*.propd.tsv"), emit: results + tuple val(meta), path("*.fdr.tsv") , emit: fdr , optional:true + tuple val(meta), path("*.adj.csv"), emit: adj , optional:true + path "*.warnings.log", emit: warnings + path "*.R_sessionInfo.log" , emit: session_info + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'propd.R' +} diff --git a/modules/nf-core/propr/propd/meta.yml b/modules/nf-core/propr/propd/meta.yml new file mode 100644 index 00000000..7dcb4625 --- /dev/null +++ b/modules/nf-core/propr/propd/meta.yml @@ -0,0 +1,76 @@ +name: "propr_propd" +description: Perform differential proportionality analysis +keywords: + - differential + - proportionality + - logratio + - expression + - propr +tools: + - "propr": + description: "Logratio methods for omics data" + homepage: "https://github.com/tpq/propr" + documentation: "https://rdrr.io/cran/propr/man/propr.html" + tool_dev_url: "https://github.com/tpq/propr" + doi: "10.1038/s41598-017-16520-0" + licence: ["GPL-2"] +input: + - meta: + type: map + description: | + Groovy Map containing additional information. + This can be used at the workflow level to pass optional parameters to the module. + [id: 'test', ...] + - count: + type: file + description: | + Count matrix, where rows = variables or genes, columns = samples or cells. + This matrix should not contain zeros. One should plug this module after another one that handles the zeros. + pattern: "*.{csv,tsv}" + - meta2: + type: map + description: | + Groovy map containing study-wide metadata related to the sample sheet and matrix + - samplesheet: + type: file + description: | + CSV or TSV format sample sheet with sample metadata +output: + - meta: + type: map + description: | + Groovy Map containing additional information. + This can be used at the workflow level to pass optional parameters to the module. + [id: 'test', ...] 
+ - propd: + type: file + description: R propd object + pattern: "*.propd.rds" + - results: + type: file + description: Results table + pattern: "*.propd.tsv" + - fdr: + type: file + description: (optional) propd fdr table + pattern: "*.fdr.tsv" + - adj: + type: file + description: (optional) propd adj table + pattern: "*.adj.csv" + - warnings: + type: file + description: propd warnings + pattern: "*.warnings.txt" + - session_info: + type: file + description: dump of R SessionInfo + pattern: "*.R_sessionInfo.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@suzannejin" +maintainers: + - "@suzannejin" diff --git a/modules/nf-core/propr/propd/templates/propd.R b/modules/nf-core/propr/propd/templates/propd.R new file mode 100644 index 00000000..63dbed49 --- /dev/null +++ b/modules/nf-core/propr/propd/templates/propd.R @@ -0,0 +1,372 @@ +#!/usr/bin/env Rscript + + +################################################ +################################################ +## Functions ## +################################################ +################################################ + +#' Parse out options from a string without recourse to optparse +#' +#' @param x Long-form argument list like --opt1 val1 --opt2 val2 +#' +#' @return named list of options and values similar to optparse + +parse_args <- function(x){ + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} + +#' Flexibly read CSV or TSV files +#' +#' @param file Input file +#' @param header Boolean. TRUE if first row is header. False without header. +#' @param row.names The first column is used as row names by default. +#' Otherwise, give another number. Or use NULL when no row.names are present. +#' +#' @return output Data frame +read_delim_flexible <- function(file, header = TRUE, row.names = 1, check.names = TRUE){ + + ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) + + if (ext == "tsv" || ext == "txt") { + separator <- "\\t" + } else if (ext == "csv") { + separator <- "," + } else { + stop(paste("Unknown separator for", ext)) + } + + mat <- read.delim( + file, + sep = separator, + header = header, + row.names = row.names, + check.names = check.names + ) + + if (!is.null(row.names)){ + if ( (row.names == 'gene_id') & ('gene_name' %in% colnames(mat)) ){ + mat <- mat[, -which(colnames(mat) == 'gene_name')] + } else if ( (row.names == 'gene_name') & ('gene_id' %in% colnames(mat)) ){ + mat <- mat[, -which(colnames(mat) == 'gene_id')] + } + } + + return(mat) +} + +#' Extract the values for a single metric and convert it into a genes x genes matrix. 
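+#' The returned matrix is symmetric, indexed by gene name on both axes, holds the
+#' theta value reported for each gene pair, and keeps NA for pairs that are not
+#' present in the propd results.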
+#'
+#' @param object propd object
+one_metric_df <- function(object){
+    results <- getResults(object)
+    # keep only the metric of interest
+    one_metric <- cbind(results\$Partner, results\$Pair, results\$theta)
+    colnames(one_metric) <- c("Partner", "Pair", "theta")
+    one_metric <- as.data.frame(one_metric)
+
+    # Extract the unique gene names
+    gene_names <- sort(unique(c(one_metric\$Partner, one_metric\$Pair)))
+    # Initialize a square matrix with NA
+    square_matrix <- matrix(NA, nrow = length(gene_names), ncol = length(gene_names))
+    rownames(square_matrix) <- gene_names
+    colnames(square_matrix) <- gene_names
+
+    # Use the `match` function to get the row and column indices
+    row_indices <- match(one_metric\$Partner, gene_names)
+    col_indices <- match(one_metric\$Pair, gene_names)
+    # Use these indices to populate the matrix
+    square_matrix[cbind(row_indices, col_indices)] <- one_metric[["theta"]]
+    # Populate the reverse pairs to ensure symmetry
+    square_matrix[cbind(col_indices, row_indices)] <- one_metric[["theta"]]
+    return(square_matrix)
+}
+
+#' Extract the differential proportionality cutoff for a specified FDR value.
+#' Gene pairs with a value lower than the extracted cutoff are considered significantly differentially proportional.
+#'
+#' @param object propd object. Output from the propd function. updateCutoffs should be applied to the object before calling valCutoff.
+#' @param fdrVal FDR value to extract the cutoff for. 0.05 by default.
+#'
+#' @return cutoff value. Differential proportionality values lower than this cutoff are considered significant.
+valCutoff <- function(object, fdrVal = 0.05){
+    fdr_df <- object@fdr
+    if (prod(dim(fdr_df) == 0)){
+        warning("Please run updateCutoffs on propd first")
+    }else{
+        fdr_vals <- fdr_df\$FDR
+        if (any(!is.na(fdr_vals))){  # if at least one FDR value is defined
+            threshold <- any(fdr_vals <= fdrVal)
+            if (threshold){
+                fdr_threshold <- fdr_vals[which.min(fdr_vals <= fdrVal) - 1]
+            }else{
+                warning("FDR is higher than the specified threshold for all proportionality values. Using the lowest FDR instead")
+                fdr_threshold <- fdr_vals[1]
+            }
+        }else{
+            stop("No true counts in the given interval. FDR values are not defined")
+            geterrmessage()
+        }
+    }
+    cutoff <- fdr_df\$cutoff[fdr_df\$FDR == fdr_threshold]
+    return(cutoff)
+}
+
+#' Convert a differential proportionality matrix to an adjacency matrix based on a threshold.
+#'
+#' @param matrix Differential proportionality (theta) matrix, e.g. as produced by one_metric_df().
+#' @param cutoff Significant differential proportionality value extracted with valCutoff().
+#'
+#' @return Adjacency matrix. Gene pairs with a value lower than the threshold will have 1, otherwise 0.
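+#'
+#' A minimal usage sketch, mirroring how these helpers are chained further down
+#' in this script (pd is a propd object on which updateCutoffs() has been run):
+#'   adj <- convert_to_adjacency(one_metric_df(pd), valCutoff(pd, fdrVal = 0.05))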
+convert_to_adjacency <- function(matrix, cutoff) { + adjacency <- ifelse(matrix < cutoff, 1, 0) + return(adjacency) +} + +################################################ +################################################ +## Parse arguments ## +################################################ +################################################ + +opt <- list( + prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix'), + count = '$count', + samplesheet = '$samplesheet', + features_id_col = 'gene_id', # column name of feature ids + obs_id_col = 'sample', # column name of observation ids + group_col = 'treatment', # column name of grouping variable + metric = 'theta_d', # differential proportionality metric: theta_d, theta_e or theta_f + alpha = NA, # alpha for boxcox transformation + permutation = 0, # permutation cycles for computing FDR + cutoff_min = NA, # minimun threshold to test + cutoff_max = NA, # maximun threshold to test + cutoff_interval = NA, # interval between thresholds + fixseed = FALSE, + adjacency = FALSE, + fdrVal = 0.05, + ncores = as.integer('$task.cpus') +) +opt_types <- list( + prefix = 'character', + count = 'character', + samplesheet = 'character', + features_id_col = 'character', + obs_id_col = 'character', + group_col = 'character', + metric = 'character', + alpha = 'numeric', + permutation = 'numeric', + cutoff_min = 'numeric', + cutoff_max = 'numeric', + cutoff_interval = 'numeric', + fixseed = 'logical', + adjacency = 'logical', + fdrVal = 'numeric', + ncores = 'numeric' +) + +# Apply parameter overrides +args_opt <- parse_args('$task.ext.args') +for ( ao in names(args_opt)){ + if (! ao %in% names(opt)){ + stop(paste("Invalid option:", ao)) + } else { + + # Preserve classes from defaults where possible + if (! is.null(opt[[ao]])){ + args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) + } + # set NA + if (args_opt[[ao]] %in% c('NA', NA, 'null')){ + args_opt[[ao]] <- NA + } + opt[[ao]] <- args_opt[[ao]] + } +} + +# Check if required parameters have been provided +required_opts <- c('count','samplesheet') +missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! required_opts %in% names(opt)] +if (length(missing) > 0){ + stop(paste("Missing required options:", paste(missing, collapse=', '))) +} + +# Check file inputs are valid +for (file_input in c('count','samplesheet')){ + if (is.null(opt[[file_input]])) { + stop(paste("Please provide", file_input), call. = FALSE) + } + if (! file.exists(opt[[file_input]])){ + stop(paste0('Value of ', file_input, ': ', opt[[file_input]], ' is not a valid file')) + } +} + +# check parameters +if (! 
opt\$metric %in% c('theta_d', 'theta_e', 'theta_f')) stop('Please provide a valid differential proportionality metric') + +################################################ +################################################ +## Finish loading libraries ## +################################################ +################################################ + +library(propr) + +################################################ +################################################ +## Perform differential proportionality ## +################################################ +################################################ + +# read matrix +mat <- read_delim_flexible( + opt\$count, + header = TRUE, + row.names = opt\$features_id_col, + check.names = FALSE +) +mat <- t(mat) + +# check zeros +# log transformation should be applied on non-zero data +# otherwise Inf values are generated +if (any(mat == 0)) print("Zeros will be replaced by minimun value before logratio analysis") + +# parse group +# this creates a vector referring to the group id for each observation +samplesheet <- read_delim_flexible( + opt\$samplesheet, + header = TRUE, + row.names = NULL, + check.names = FALSE +) +tmp <- samplesheet[[opt\$group_col]] +names(tmp) <- samplesheet[[opt\$obs_id_col]] +group <- as.vector(tmp[rownames(mat)]) +if (length(group) != nrow(mat)) stop('Error when parsing group') + +# perform differential proportionality +pd <- propd( + mat, + group = group, + alpha = opt\$alpha, + weighted = FALSE, + p = opt\$permutation, + fixseed = opt\$fixseed +) + +if (opt\$metric == 'theta_d'){ + pd <- setDisjointed(pd) +} else if (opt\$metric == 'theta_e'){ + pd <- setEmergent(pd) +} else if (opt\$metric == 'theta_f'){ + pd <- setActive(pd, what = "theta_f") +} + +# update FDR by permutation, if required +if (opt\$permutation > 0) { + cutoff <- seq( + opt\$cutoff_min, + opt\$cutoff_max, + opt\$cutoff_interval + ) + pd <- updateCutoffs(pd, cutoff=cutoff, ncores=opt\$ncores) + if (opt\$metric == 'theta_d') pd <- updateF(pd) +} + +# Extract adjacency matrix if required +if (opt\$adjacency == TRUE) { + matrix <- one_metric_df(pd) + cutoff <- valCutoff(pd, opt\$fdrVal) + adj <- convert_to_adjacency(matrix, cutoff) +} + +################################################ +################################################ +## Generate outputs ## +################################################ +################################################ + +saveRDS( + pd, + file = paste0(opt\$prefix, '.propd.rds') +) + +write.table( + getResults(pd), + file = paste0(opt\$prefix, '.propd.tsv'), + col.names = TRUE, + row.names = FALSE, + sep = '\\t', + quote = FALSE +) + +if (opt\$permutation > 0) { + write.table( + pd@fdr, + file = paste0(opt\$prefix, '.fdr.tsv'), + col.names = TRUE, + sep = '\\t', + quote = FALSE + ) +} + +if (opt\$adjacency == TRUE) { + write.table( + adj, + file = paste0(opt\$prefix, '.adj.csv'), + col.names = TRUE, + row.names = TRUE, + sep = ',', + quote = FALSE + ) +} + +################################################ +################################################ +## WARNINGS ## +################################################ +################################################ + +sink(paste0(opt\$prefix, ".warnings.log")) +print(warnings()) +sink() + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +sink(paste0(opt\$prefix, ".R_sessionInfo.log")) 
+print(sessionInfo()) +sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +propr.version <- as.character(packageVersion('propr')) + +writeLines( + c( + '"${task.process}":', + paste(' r-propr:', propr.version) + ), +'versions.yml') + +################################################ +################################################ +################################################ +################################################ diff --git a/modules/nf-core/propr/propd/tests/adjacency.config b/modules/nf-core/propr/propd/tests/adjacency.config new file mode 100644 index 00000000..072a4d75 --- /dev/null +++ b/modules/nf-core/propr/propd/tests/adjacency.config @@ -0,0 +1,3 @@ +process { + ext.args = {"--permutation 10 --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.1 --fixseed true --adjacency true"} +} \ No newline at end of file diff --git a/modules/nf-core/propr/propd/tests/boxcox_theta_e.config b/modules/nf-core/propr/propd/tests/boxcox_theta_e.config new file mode 100755 index 00000000..40c0548d --- /dev/null +++ b/modules/nf-core/propr/propd/tests/boxcox_theta_e.config @@ -0,0 +1,4 @@ +process { + ext.args = {"--metric theta_e --alpha 0.2 --permutation 10 --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.05 --fixseed true"} + ext.prefix = {"test+theta_e+0.2"} +} \ No newline at end of file diff --git a/modules/nf-core/propr/propd/tests/default_boxcox.config b/modules/nf-core/propr/propd/tests/default_boxcox.config new file mode 100755 index 00000000..831002d9 --- /dev/null +++ b/modules/nf-core/propr/propd/tests/default_boxcox.config @@ -0,0 +1,4 @@ +process { + ext.args = {"--alpha 0.2 --permutation 10 --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.05 --fixseed true"} + ext.prefix = {"test+theta_d+0.2"} +} \ No newline at end of file diff --git a/modules/nf-core/propr/propd/tests/default_permutation.config b/modules/nf-core/propr/propd/tests/default_permutation.config new file mode 100755 index 00000000..e89c239f --- /dev/null +++ b/modules/nf-core/propr/propd/tests/default_permutation.config @@ -0,0 +1,4 @@ +process { + ext.args = {"--permutation 10 --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.05 --fixseed true"} + ext.prefix = {"test+theta_d+NA"} +} \ No newline at end of file diff --git a/modules/nf-core/propr/propd/tests/main.nf.test b/modules/nf-core/propr/propd/tests/main.nf.test new file mode 100755 index 00000000..9fcaf93a --- /dev/null +++ b/modules/nf-core/propr/propd/tests/main.nf.test @@ -0,0 +1,154 @@ +nextflow_process { + + name "Test Process PROPR_PROPD" + script "../main.nf" + process "PROPR_PROPD" + + tag "modules" + tag "modules_nfcore" + tag "propr" + tag "propr/propd" + + test("Test propr/propd using default permutation") { + + tag "default" + config "./default_permutation.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.salmon.merged.gene_counts.top1000cov.tsv") + ] + input[1] = [ + [ id: 'test'], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.samplesheet.csv") + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.results).match("Test propr/propd using default permutation - 
results") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("Test propr/propd using default boxcox permutation") { + + tag "default_boxcox" + config "./default_boxcox.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.salmon.merged.gene_counts.top1000cov.tsv") + ] + input[1] = [ + [ id: 'test'], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.samplesheet.csv") + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.results).match(" Test propr/propd using default boxcox permutation - results") }, + { assert snapshot(process.out.fdr).match(" Test propr/propd using default boxcox permutation - fdr") } + ) + } + } + + test("Test propr/propd using theta_e permutation") { + + tag "theta_e" + config "./theta_e.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.salmon.merged.gene_counts.top1000cov.tsv") + ] + input[1] = [ + [ id: 'test'], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.samplesheet.csv") + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.results).match("Test propr/propd using theta_e permutation - results") } + ) + } + } + + test("Test propr/propd using theta_e and boxcox permutation") { + + tag "boxcox_theta_e" + config "./boxcox_theta_e.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.salmon.merged.gene_counts.top1000cov.tsv") + ] + input[1] = [ + [ id: 'test'], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.samplesheet.csv") + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.results).match("Test propr/propd using theta_e and boxcox permutation - results") } + ) + } + } + + test("Test propr/propd with adjacency matrix") { + + tag "adjacency" + config "./adjacency.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.salmon.merged.gene_counts.top1000cov.tsv") + ] + input[1] = [ + [ id: 'test'], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.samplesheet.csv") + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.adj).match("Test propr/propd with adjacency matrix - adj") }, + { assert snapshot(process.out.results).match(" - results") } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/propr/propd/tests/main.nf.test.snap b/modules/nf-core/propr/propd/tests/main.nf.test.snap new file mode 100644 index 00000000..e0291044 --- /dev/null +++ b/modules/nf-core/propr/propd/tests/main.nf.test.snap @@ -0,0 +1,133 @@ +{ + " Test propr/propd using default boxcox permutation - fdr": { + "content": [ + [ + [ + { + "id": "test" + }, + 
"test+theta_d+0.2.fdr.tsv:md5,17e1c382e5f8275e2858a86e98c1aa6c" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-23T13:10:06.778954" + }, + "Test propr/propd using theta_e and boxcox permutation - results": { + "content": [ + [ + [ + { + "id": "test" + }, + "test+theta_e+0.2.propd.tsv:md5,d56fcc7c8ae0b0853ea9ca6ac6484a08" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-23T13:13:35.158486" + }, + "Test propr/propd using theta_e permutation - results": { + "content": [ + [ + [ + { + "id": "test" + }, + "test+theta_e+NA.propd.tsv:md5,c190d80c11ba99a0303a8dd5ab8ed76f" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-23T13:12:03.500722" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,b41d17751970fc8bcf4f8e0326d239e2" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-23T13:07:07.588326" + }, + " - results": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.propd.tsv:md5,34fda117492faf9a60f5807f56c4be68" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-23T13:16:44.428551" + }, + " Test propr/propd using default boxcox permutation - results": { + "content": [ + [ + [ + { + "id": "test" + }, + "test+theta_d+0.2.propd.tsv:md5,f1886c538e6aeed1bbac4c8c1ef0c930" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-23T13:10:02.25738" + }, + "Test propr/propd using default permutation - results": { + "content": [ + [ + [ + { + "id": "test" + }, + "test+theta_d+NA.propd.tsv:md5,34fda117492faf9a60f5807f56c4be68" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-23T13:07:04.720183" + }, + "Test propr/propd with adjacency matrix - adj": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.adj.csv:md5,9da907136fba72b0e098c7fbacbeb837" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-23T13:16:38.527389" + } +} \ No newline at end of file diff --git a/modules/nf-core/propr/propd/tests/tags.yml b/modules/nf-core/propr/propd/tests/tags.yml new file mode 100755 index 00000000..ba65ca0a --- /dev/null +++ b/modules/nf-core/propr/propd/tests/tags.yml @@ -0,0 +1,2 @@ +propr/propd: + - "modules/nf-core/propr/propd/**" diff --git a/modules/nf-core/propr/propd/tests/theta_e.config b/modules/nf-core/propr/propd/tests/theta_e.config new file mode 100755 index 00000000..37c0dd5a --- /dev/null +++ b/modules/nf-core/propr/propd/tests/theta_e.config @@ -0,0 +1,4 @@ +process { + ext.args = {"--metric theta_e --permutation 10 --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.05 --fixseed true"} + ext.prefix = {"test+theta_e+NA"} +} \ No newline at end of file diff --git a/modules/nf-core/propr/propr/environment.yml b/modules/nf-core/propr/propr/environment.yml new file mode 100644 index 00000000..cb163068 --- /dev/null +++ b/modules/nf-core/propr/propr/environment.yml @@ -0,0 +1,7 @@ +name: propr_propr +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::r-propr=5.0.3 diff --git a/modules/nf-core/propr/propr/main.nf b/modules/nf-core/propr/propr/main.nf new file mode 100644 index 00000000..111f6d58 --- /dev/null +++ b/modules/nf-core/propr/propr/main.nf @@ -0,0 +1,27 @@ +process PROPR_PROPR { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + 
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/r-propr:5.0.3': + 'biocontainers/r-propr:5.0.3' }" + + input: + tuple val(meta), path(count) + + output: + tuple val(meta), path("*.propr.rds"), emit: propr + tuple val(meta), path("*.propr.tsv"), emit: matrix + tuple val(meta), path("*.fdr.tsv"), emit: fdr , optional:true + tuple val(meta), path("*.adj.csv"), emit: adj , optional:true + path "*.warnings.log", emit: warnings + path "*.R_sessionInfo.log", emit: session_info + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'propr.R' +} diff --git a/modules/nf-core/propr/propr/meta.yml b/modules/nf-core/propr/propr/meta.yml new file mode 100644 index 00000000..c2c9a11d --- /dev/null +++ b/modules/nf-core/propr/propr/meta.yml @@ -0,0 +1,76 @@ +name: "propr_propr" +description: | + Perform logratio-based correlation analysis -> get proportionality & basis shrinkage partial correlation coefficients. + One can also compute standard correlation coefficients, if required. +keywords: + - coexpression + - correlation + - proportionality + - logratio + - propr + - corpcor + +tools: + - "propr": + description: "Logratio methods for omics data" + homepage: "https://github.com/tpq/propr" + documentation: "https://rdrr.io/cran/propr/man/propr.html" + tool_dev_url: "https://github.com/tpq/propr" + doi: "10.1038/s41598-017-16520-0" + licence: ["GPL-2"] + - "corpcor": + description: "Efficient Estimation of Covariance and (Partial) Correlation" + homepage: "https://cran.r-project.org/web/packages/corpcor/index.html" + documentation: "https://cran.r-project.org/web/packages/corpcor/corpcor.pdf" + doi: "10.2202/1544-6115.1175" + licence: ["GPL >=3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information. + This can be used at the workflow level to pass optional parameters to the module. + [id: 'test', ...] + - count: + type: file + description: | + Count matrix, where rows = variables or genes, columns = samples or cells. + This matrix should not contain zeros. Otherwise, they will be replaced by the minimun number. + It is recommended to handle the zeros beforehand with the method of preference. + pattern: "*.{csv,tsv}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information. + This can be used at the workflow level to pass optional parameters to the module. + [id: 'test', ...] 
+ - propr: + type: file + description: R propr object + pattern: "*.propr.rds" + - matrix: + type: file + description: Coefficient matrix + pattern: "*.propr.tsv" + - fdr: + type: file + description: (optional) propr fdr table + pattern: "*.fdr.tsv" + - adj: + type: file + description: (optional) propr adjacency table + pattern: "*.adj.csv" + - session_info: + type: file + description: dump of R SessionInfo + pattern: "*.R_sessionInfo.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@suzannejin" diff --git a/modules/nf-core/propr/propr/templates/propr.R b/modules/nf-core/propr/propr/templates/propr.R new file mode 100644 index 00000000..a9dbd7bd --- /dev/null +++ b/modules/nf-core/propr/propr/templates/propr.R @@ -0,0 +1,419 @@ +#!/usr/bin/env Rscript + + +################################################ +################################################ +## Functions ## +################################################ +################################################ + +#' Parse out options from a string without recourse to optparse +#' +#' @param x Long-form argument list like --opt1 val1 --opt2 val2 +#' +#' @return named list of options and values similar to optparse + +parse_args <- function(x){ + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} + +#' Flexibly read CSV or TSV files +#' +#' @param file Input file +#' @param header Boolean. TRUE if first row is header. False without header. +#' @param row.names The first column is used as row names by default. +#' Otherwise, give another number. Or use NULL when no row.names are present. +#' +#' @return output Data frame +read_delim_flexible <- function(file, header = TRUE, row.names = 1, check.names = TRUE){ + + ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) + + if (ext == "tsv" || ext == "txt") { + separator <- "\\t" + } else if (ext == "csv") { + separator <- "," + } else { + stop(paste("Unknown separator for", ext)) + } + + mat <- read.delim( + file, + sep = separator, + header = header, + row.names = row.names, + check.names = check.names + ) + + if ( (row.names == 'gene_id') & ('gene_name' %in% colnames(mat)) ){ + mat <- mat[, -which(colnames(mat) == 'gene_name')] + } else if ( (row.names == 'gene_name') & ('gene_id' %in% colnames(mat)) ){ + mat <- mat[, -which(colnames(mat) == 'gene_id')] + } + + return(mat) +} + +#' Check if a variable can be numeric or not +#' +#' @param x Input variable +#' @return True if it can be numeric, False otherwise +can_be_numeric <- function(x) { + stopifnot(is.atomic(x) || is.list(x)) # check if x is a vector + numNAs <- sum(is.na(x)) + numNAs_new <- suppressWarnings(sum(is.na(as.numeric(x)))) + return(numNAs_new == numNAs) +} + +#' Set the proper reference gene index. +#' This should be used for alr transformation only. +#' +#' @param ivar Reference variable given by user. +#' If it is 'null', then set the last column as reference (default). +#' Otherwise, it should refer to either gene name or gene index. +#' If the gene name is given, find its index. 
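+#' For example (hypothetical values): set_reference(NA, mat) falls back to the
+#' last column index, ncol(mat), while set_reference('geneB', mat) returns the
+#' column index of the gene named 'geneB'.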
+#' @param mat Data matrix, with genes as columns
+#'
+#' @return The reference gene index
+set_reference <- function(ivar, mat){
+    if (is.na(ivar)){
+        ivar <- ncol(mat)
+    } else {
+        isnumeric <- can_be_numeric(ivar)
+        if (!isnumeric){
+            genes <- colnames(mat)
+            ivar <- which(genes == ivar)
+        }
+        ivar <- as.integer(ivar)
+    }
+    return(ivar)
+}
+
+#' Set the appropriate range for the sequence of cutoffs used in updateCutoffs.
+#' Adjusts the interval to the different metrics.
+#'
+#' @param object propr object. Output from the propr function.
+#'
+#' @return sequence of cutoff values.
+seqCutoff <- function(object){
+    matrix <- getMatrix(object)
+    matrix[matrix <= 0] <- NA
+    diag(matrix) <- NA
+    min_cutoff <- round(min(matrix, na.rm = TRUE), 3)
+    max_cutoff <- round(max(matrix, na.rm = TRUE), 3)
+    step_cutoff <- (max_cutoff - min_cutoff) / 20
+    seq_cutoff <- seq(min_cutoff, max_cutoff, step_cutoff)
+    return(seq_cutoff)
+}
+
+#' Extract the proportionality cutoff for a specified FDR value.
+#' Gene pairs with a proportionality value higher than the extracted cutoff are considered significantly proportional.
+#'
+#' @param object propr object. Output from the propr function. updateCutoffs should be applied to the object before calling valCutoff.
+#' @param fdrVal FDR value to extract the cutoff for. 0.05 by default.
+#' @param metric Metric used to calculate the proportionality values. Options are 'cor', 'rho', 'phi', 'phs', 'vlr', 'pcor', 'pcor.shrink', 'pcor.bshrink'
+#'
+#' @return cutoff value. Proportionality values higher than this cutoff are considered significant.
+valCutoff <- function(object, metric, fdrVal = 0.05){
+    fdr_df <- object@fdr
+    print(fdr_df)
+    # metric_up <- c("rho", "cor", "pcor", "pcor.shrink", "pcor.bshrink")
+
+    if (prod(dim(fdr_df) == 0)){
+        warning("Please run updateCutoffs on propr first")
+    }else{
+        fdr_vals <- fdr_df\$FDR
+        if(any(!is.na(fdr_vals))){  # if at least one FDR value is defined; otherwise the cutoffs are out of range
+            if(any(fdr_vals <= fdrVal)){  # if at least one value is at or below the FDR threshold
+                fdr_threshold <- fdr_vals[which.max(fdr_vals <= fdrVal)]  # take the first FDR value at or below the threshold
+            }else{
+                warning("FDR is higher than the specified threshold for all proportionality values. Using the lowest FDR instead")
+                fdr_threshold <- min(fdr_vals, na.rm = TRUE)
+            }
+            cutoff <- fdr_df\$cutoff[which(fdr_df\$FDR == fdr_threshold)]  # select the corresponding cutoff value for the FDR
+            print(cutoff)
+        }else{
+            stop("FDR not defined. This metric is not appropriate for the given dataset")
+        }
+        return(cutoff)
+    }
+}
+
+
+#' Convert a proportionality matrix to an adjacency matrix based on a threshold.
+#'
+#' @param matrix proportionality matrix. Can be extracted from a propr object with getMatrix().
+#' @param cutoff Significant proportionality value extracted with valCutoff().
+#' @param metric Metric used to calculate the proportionality values.
+#'
+#' @return Adjacency matrix. Gene pairs with a proportionality value higher than the threshold will have 1, otherwise 0.
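+#'
+#' A minimal usage sketch, matching how this helper is called later in this
+#' script (pro is a propr object on which updateCutoffs() has been run):
+#'   adj <- convert_to_adjacency(getMatrix(pro), valCutoff(pro, metric), metric)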
+convert_to_adjacency <- function(matrix, cutoff, metric) { + if (metric == 'cor' || metric == 'rho' || metric == 'pcor' || metric == 'pcor.shrink' || metric == 'pcor.bshrink'){ + adjacency <- ifelse(matrix > cutoff, 1, 0) + } else { + adjacency <- ifelse(matrix < cutoff, 1, 0) + } + return(adjacency) +} + +################################################ +################################################ +## Parse arguments ## +################################################ +################################################ + +opt <- list( + count = '$count', + prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix'), + transformation = 'clr', + reference = NA, + alpha = NA, + metric = 'pcor.bshrink', + permutation = 0, + cutoff_min = NA, + cutoff_max = NA, + cutoff_interval = NA, + ncores = as.integer('$task.cpus'), + features_id_col = 'gene_id', + fixseed = FALSE, + adjacency = FALSE, + fdrVal = 0.05 +) +opt_types <- list( + count = 'character', + prefix = 'character', + transformation = 'character', + reference = 'character', + alpha = 'numeric', + metric = 'character', + permutation = 'numeric', + cutoff_min = 'numeric', + cutoff_max = 'numeric', + cutoff_interval = 'numeric', + ncores = 'numeric', + features_id_col = 'character', + fixseed = 'logical', + adjacency = 'logical', + fdrVal = 'numeric' +) + +# Apply parameter overrides +args_opt <- parse_args('$task.ext.args') + +for ( ao in names(args_opt)){ + if (! ao %in% names(opt)){ + stop(paste("Invalid option:", ao)) + } else { + + # Preserve classes from defaults where possible + if (! is.null(opt[[ao]])){ + args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) + } + # set NA + if (args_opt[[ao]] %in% c('NA', NA, 'null')){ + args_opt[[ao]] <- NA + } + opt[[ao]] <- args_opt[[ao]] + } +} + +# Check if required parameters have been provided +required_opts <- c('count') +missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! required_opts %in% names(opt)] +if (length(missing) > 0){ + stop(paste("Missing required options:", paste(missing, collapse=', '))) +} + +# Check file inputs are valid +for (file_input in c('count')){ + if (is.null(opt[[file_input]])) { + stop(paste("Please provide", file_input), call. = FALSE) + } + if (! file.exists(opt[[file_input]])){ + stop(paste0('Value of ', file_input, ': ', opt[[file_input]], ' is not a valid file')) + } +} + +# check parameters +if (!opt\$metric %in% c('rho', 'phi', 'phs', 'cor', 'vlr', 'pcor', 'pcor.shrink', 'pcor.bshrink')) { + stop('Please make sure you provided the correct metric') +} +if (opt\$metric == 'pcor.bshrink'){ + if (!is.na(opt\$alpha)) stop('Box-cox transformation is not implemented for pcor.bshrink yet.') + if (!opt\$transformation %in% c('clr', 'alr')) stop('Please make sure you provided the correct transformation: clr or alr') +} else { + if (!opt\$transformation %in% c('clr', 'alr', NA)) stop('Please make sure you provided the correct transformation: clr or alr. Or set NA if you dont want to transform the data.') + if (is.na(opt\$transformation)) print('Warning: No transformation is required by user. 
We assume the input count data was already properly transformed.') +} + +################################################ +################################################ +## Finish loading libraries ## +################################################ +################################################ + +library(corpcor) +library(propr) + +################################################ +################################################ +## Perform correlation analysis ## +################################################ +################################################ + +# read matrix +mat <- read_delim_flexible( + opt\$count, + header = TRUE, + row.names = opt\$features_id_col, + check.names = FALSE +) +mat <- t(mat) + +# check zeros +# log transformation should be applied on non-zero data +# otherwise Inf values are generated +if (any(mat == 0)) print("Warning: Zeros will be replaced by minimun value before logratio analysis") + +# set logratio transformation parameter -> ivar +# if alr, set the index of the reference gene as ivar +# if clr or NA, set same ivar +if (opt\$metric == 'pcor.bshrink'){ + opt\$ivar <- opt\$transformation +}else{ + if (opt\$transformation == 'alr'){ + opt\$ivar <- set_reference(opt\$reference, mat) + gene_name <- colnames(mat)[opt\$ivar] + } else { + opt\$ivar <- opt\$transformation + } +} + +# Compute correlation coefficients +pro <- propr( + mat, + metric = opt\$metric, + ivar = opt\$ivar, + alpha = opt\$alpha, + p = opt\$permutation, + fixseed = opt\$fixseed +) + +# update FDR by permutation, if required + +if (opt\$permutation > 0) { + cutoff <- seq( + opt\$cutoff_min, + opt\$cutoff_max, + opt\$cutoff_interval + ) + if (is.na(opt\$cutoff_min) || is.na(opt\$cutoff_max) || is.na(opt\$cutoff_interval)) { + warning("cutoff values were not provided. Using the default cutoff values.") + cutoff <- seqCutoff(pro) + } + m <- getMatrix(pro) + diag(m) <- NA + print((opt\$cutoff_max - opt\$cutoff_min)/2 + opt\$cutoff_min) + print(max(m, na.rm = TRUE)) + if ((opt\$cutoff_max - opt\$cutoff_min)/2 + opt\$cutoff_min > max(m, na.rm = TRUE)) { + warning("The provided cutoff values are out of range. 
Using the default cutoff values.") + cutoff <- seqCutoff(pro) + } + pro <- updateCutoffs(pro, cutoff=cutoff, ncores=opt\$ncores) +} + +# calculate cutoff and adjacency matrix, if required + +if (opt\$adjacency == TRUE) { + cutoff <- valCutoff(pro, opt\$metric, opt\$fdrVal) + matrix <- getMatrix(pro) + adj <- convert_to_adjacency(matrix, cutoff, opt\$metric) +} + +################################################ +################################################ +## Generate outputs ## +################################################ +################################################ + +saveRDS( + pro, + file = paste0(opt\$prefix, '.propr.rds') +) + +write.table( + round(pro@matrix, 8), # round matrix decimals to avoid floating point inconsistencies + file = paste0(opt\$prefix, '.propr.tsv'), + col.names = TRUE, + row.names = TRUE, + sep = '\t', + quote = FALSE +) + +if (opt\$permutation > 0) { + write.table( + pro@fdr, + file = paste0(opt\$prefix, '.fdr.tsv'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE + ) +} + +if (opt\$adjacency == TRUE) { + write.table( + adj, + file = paste0(opt\$prefix, '.adj.csv'), + col.names = TRUE, + row.names = TRUE, + sep = ',', + quote = FALSE + ) +} + +################################################ +################################################ +## WARNINGS ## +################################################ +################################################ + +sink(paste0(opt\$prefix, ".warnings.log")) +print(warnings()) +sink() + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +sink(paste0(opt\$prefix, ".R_sessionInfo.log")) +print(sessionInfo()) +sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +propr.version <- as.character(packageVersion('propr')) + +writeLines( + c( + '"${task.process}":', + paste(' r-propr:', propr.version) + ), +'versions.yml') + +################################################ +################################################ +################################################ +################################################ diff --git a/modules/nf-core/propr/propr/tests/adjacency.config b/modules/nf-core/propr/propr/tests/adjacency.config new file mode 100644 index 00000000..cc0b3894 --- /dev/null +++ b/modules/nf-core/propr/propr/tests/adjacency.config @@ -0,0 +1,3 @@ +process { + ext.args = {"--metric rho --permutation 10 --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.1 --fixseed true --adjacency true"} +} diff --git a/modules/nf-core/propr/propr/tests/adjacency_pcor.config b/modules/nf-core/propr/propr/tests/adjacency_pcor.config new file mode 100644 index 00000000..0dda6b73 --- /dev/null +++ b/modules/nf-core/propr/propr/tests/adjacency_pcor.config @@ -0,0 +1,3 @@ +process { + ext.args = {"--metric pcor.bshrink --permutation 10 --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.1 --fixseed true --adjacency true"} +} \ No newline at end of file diff --git a/modules/nf-core/propr/propr/tests/adjacency_phs.config b/modules/nf-core/propr/propr/tests/adjacency_phs.config new file mode 100644 index 00000000..923570b4 --- /dev/null +++ b/modules/nf-core/propr/propr/tests/adjacency_phs.config @@ -0,0 +1,3 @@ +process { + ext.args = 
{"--metric phs --permutation 10 --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.1 --fixseed true --adjacency true"} +} \ No newline at end of file diff --git a/modules/nf-core/propr/propr/tests/adjacency_rho.config b/modules/nf-core/propr/propr/tests/adjacency_rho.config new file mode 100644 index 00000000..cc0b3894 --- /dev/null +++ b/modules/nf-core/propr/propr/tests/adjacency_rho.config @@ -0,0 +1,3 @@ +process { + ext.args = {"--metric rho --permutation 10 --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.1 --fixseed true --adjacency true"} +} diff --git a/modules/nf-core/propr/propr/tests/alr_pcorbshrink.config b/modules/nf-core/propr/propr/tests/alr_pcorbshrink.config new file mode 100644 index 00000000..c0c3b211 --- /dev/null +++ b/modules/nf-core/propr/propr/tests/alr_pcorbshrink.config @@ -0,0 +1,3 @@ +process { + ext.args = {"--transformation alr --metric pcor.bshrink --permutation 10 --cutoff_min 0.001 --cutoff_max 0.01 --cutoff_interval 0.001 --fixseed true"} +} \ No newline at end of file diff --git a/modules/nf-core/propr/propr/tests/clr_pcor.config b/modules/nf-core/propr/propr/tests/clr_pcor.config new file mode 100644 index 00000000..2e6d1b19 --- /dev/null +++ b/modules/nf-core/propr/propr/tests/clr_pcor.config @@ -0,0 +1,3 @@ +process { + ext.args = {"--transformation clr --metric pcor --permutation 10 --cutoff_min 0.001 --cutoff_max 0.01 --cutoff_interval 0.001 --fixseed true"} +} \ No newline at end of file diff --git a/modules/nf-core/propr/propr/tests/clr_pcorbshrink.config b/modules/nf-core/propr/propr/tests/clr_pcorbshrink.config new file mode 100644 index 00000000..435440c8 --- /dev/null +++ b/modules/nf-core/propr/propr/tests/clr_pcorbshrink.config @@ -0,0 +1,3 @@ +process { + ext.args = {"--transformation clr --metric pcor.bshrink --permutation 10 --cutoff_min 0.001 --cutoff_max 0.01 --cutoff_interval 0.001 --fixseed true"} +} \ No newline at end of file diff --git a/modules/nf-core/propr/propr/tests/clr_rho.config b/modules/nf-core/propr/propr/tests/clr_rho.config new file mode 100644 index 00000000..6b32ac27 --- /dev/null +++ b/modules/nf-core/propr/propr/tests/clr_rho.config @@ -0,0 +1,3 @@ +process { + ext.args = {"--transformation clr --metric rho --permutation 10 --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.1 --fixseed true"} +} \ No newline at end of file diff --git a/modules/nf-core/propr/propr/tests/clr_rho_alpha.config b/modules/nf-core/propr/propr/tests/clr_rho_alpha.config new file mode 100644 index 00000000..7f151a80 --- /dev/null +++ b/modules/nf-core/propr/propr/tests/clr_rho_alpha.config @@ -0,0 +1,3 @@ +process { + ext.args = {"--transformation alr --alpha 0.2 --metric rho --permutation 10 --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.1 --fixseed true"} +} \ No newline at end of file diff --git a/modules/nf-core/propr/propr/tests/main.nf.test b/modules/nf-core/propr/propr/tests/main.nf.test new file mode 100644 index 00000000..e262d2cf --- /dev/null +++ b/modules/nf-core/propr/propr/tests/main.nf.test @@ -0,0 +1,240 @@ +nextflow_process { + + name "Test Process PROPR_PROPR" + script "../main.nf" + process "PROPR_PROPR" + + tag "modules" + tag "modules_nfcore" + tag "propr" + tag "propr/propr" + + test("Test propr/propr using default options") { + + tag "default" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + //file(params.test_data['mus_musculus']['genome']['rnaseq_matrix'], checkIfExists: true) + 
file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.salmon.merged.gene_counts.top1000cov.tsv") + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.matrix).match("Test propr/propr using default options - matrix") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("Test propr/propr while running clr+pcor.bshrink explicitly") { + + tag "clr_pcorbshrink" + config "./clr_pcorbshrink.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['mus_musculus']['genome']['rnaseq_matrix'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.matrix).match("Test propr/propr while running clr+pcor.bshrink explicitly - matrix")}, + { assert snapshot(process.out.fdr).match("Test propr/propr while running clr+pcor.bshrink explicitly - fdr")} + ) + } + } + + test("Test propr/propr while running alr+pcor.bshrink") { + + tag "alr_pcorbshrink" + config "./alr_pcorbshrink.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['mus_musculus']['genome']['rnaseq_matrix'], checkIfExists: true) + ] + """ + } + } + + then { + // TODO also check that the first columns are the same as the above processes + assertAll( + { assert process.success }, + { assert snapshot(process.out.matrix).match("Test propr/propr while running alr+pcor.bshrink - matrix") }, + { assert snapshot(process.out.fdr).match("Test propr/propr while running alr+pcor.bshrink - fdr") } + ) + } + } + + test("Test propr/propr while running clr+rho") { + + tag "clr_rho" + config "./clr_rho.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['mus_musculus']['genome']['rnaseq_matrix'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.matrix).match("Test propr/propr while running clr+rho - matrix") }, + { assert snapshot(process.out.fdr).match("Test propr/propr while running clr+rho - fdr") } + ) + } + } + + test("Test propr/propr while running clr+pcor") { + + tag "clr_pcor" + config "./clr_pcor.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['mus_musculus']['genome']['rnaseq_matrix'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.matrix).match("Test propr/propr while running clr+pcor - matrix") }, + { assert snapshot(process.out.fdr).match("Test propr/propr while running clr+pcor - fdr") } + ) + } + } + + test("Test propr/propr while running clr+rho with boxcox transformation") { + + tag "clr_rho_alpha" + config "./clr_rho_alpha.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['mus_musculus']['genome']['rnaseq_matrix'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.matrix).match("Test propr/propr while running clr+rho with boxcox transformation - matrix") }, + { assert snapshot(process.out.fdr).match("Test propr/propr while running clr+rho with boxcox transformation - fdr") } + ) + } + } + + test("Test propr/propr calculating adjacency_matrix for rho") { + + tag "adjacency_rho" + config "./adjacency_rho.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + 
file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.salmon.merged.gene_counts.top1000cov.tsv") + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.matrix).match("Test propr/propr calculating adjacency_matrix for rho - matrix") }, + { assert snapshot(process.out.fdr).match("Test propr/propr calculating adjacency_matrix for rho - fdr") }, + { assert snapshot(process.out.adj).match("Test propr/propr calculating adjacency_matrix for rho - adj") } + ) + } + } + + test("Test propr/propr calculating adjacency_matrix for phs") { + + tag "adjacency_phs" + config "./adjacency_phs.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.salmon.merged.gene_counts.top1000cov.tsv") + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.matrix).match("Test propr/propr calculating adjacency_matrix for phs - matrix") }, + { assert snapshot(process.out.fdr).match("Test propr/propr calculating adjacency_matrix for phs - fdr") }, + { assert snapshot(process.out.adj).match("Test propr/propr calculating adjacency_matrix for phs - adj") } + ) + } + } + + test("Test propr/propr calculating adjacency_matrix for pcor") { + + tag "adjacency_pcor" + config "./adjacency_pcor.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.salmon.merged.gene_counts.top1000cov.tsv") + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.matrix).match("Test propr/propr calculating adjacency_matrix for pcor - matrix") }, + { assert snapshot(process.out.fdr).match("Test propr/propr calculating adjacency_matrix for pcor - fdr") }, + { assert snapshot(process.out.adj).match("Test propr/propr calculating adjacency_matrix for pcor - adj") } + ) + } + } +} diff --git a/modules/nf-core/propr/propr/tests/main.nf.test.snap b/modules/nf-core/propr/propr/tests/main.nf.test.snap new file mode 100644 index 00000000..cde969bc --- /dev/null +++ b/modules/nf-core/propr/propr/tests/main.nf.test.snap @@ -0,0 +1,354 @@ +{ + "Test propr/propr while running clr+pcor - fdr": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.fdr.tsv:md5,c0147540e44c63a439c28d24315bbc83" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:22:00.577754" + }, + "Test propr/propr calculating adjacency_matrix for rho - fdr": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.fdr.tsv:md5,32be6cc96d2d5563f779d2eec9c7ed34" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:23:04.464473" + }, + "Test propr/propr calculating adjacency_matrix for pcor - adj": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.adj.csv:md5,d5ba274e1313177ddf28881b036885c5" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:24:45.008049" + }, + "Test propr/propr while running alr+pcor.bshrink - fdr": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.fdr.tsv:md5,d446cc30af27643df8555b2924e3c0a1" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:20:50.487477" + }, + 
"Test propr/propr using default options - matrix": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.propr.tsv:md5,1aed2bed28b3f6cff481065f535e6d24" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:19:27.698729" + }, + "Test propr/propr calculating adjacency_matrix for phs - adj": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.adj.csv:md5,1cb74934c2dbb93ad6416b5410dee1c4" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:23:45.966557" + }, + "Test propr/propr while running clr+pcor.bshrink explicitly - matrix": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.propr.tsv:md5,1aed2bed28b3f6cff481065f535e6d24" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:20:07.836881" + }, + "Test propr/propr while running clr+pcor - matrix": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.propr.tsv:md5,5bd6a9c2d5899dedfec8ebfbabeb532e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:21:59.693747" + }, + "Test propr/propr while running alr+pcor.bshrink - matrix": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.propr.tsv:md5,e2cbc066fa7635e9ea4ec198987d11d0" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:20:49.898429" + }, + "Test propr/propr calculating adjacency_matrix for pcor - fdr": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.fdr.tsv:md5,08d35cc8a7f5dd39ea4e3471cee7b2ec" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:24:43.724784" + }, + "Test propr/propr while running clr+rho - fdr": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.fdr.tsv:md5,32be6cc96d2d5563f779d2eec9c7ed34" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:21:19.887004" + }, + "Test propr/propr while running clr+rho - matrix": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.propr.tsv:md5,53865e6c2b05e022277df6dc1188c461" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:21:19.194321" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,2e3159924c190ab42e22a4d0e192b1e6" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:19:27.963267" + }, + "Test propr/propr calculating adjacency_matrix for phs - fdr": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.fdr.tsv:md5,d7e540af9ed2d59ffbc69caae819648d" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:23:44.841504" + }, + "Test propr/propr calculating adjacency_matrix for pcor - matrix": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.propr.tsv:md5,1aed2bed28b3f6cff481065f535e6d24" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:24:41.975415" + }, + "Test propr/propr while running clr+rho with boxcox transformation - fdr": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.fdr.tsv:md5,a0fc0d01dddb4bd285306766eefeff67" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:22:29.830787" + }, + "Test propr/propr calculating adjacency_matrix for rho - adj": { + "content": [ + [ + [ + { + "id": "test" + }, + 
"test.adj.csv:md5,e1b6e50216976397655066f17f0703de" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:23:05.438967" + }, + "Test propr/propr while running clr+rho with boxcox transformation - matrix": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.propr.tsv:md5,e30643e340aa88e3acd4c181f9e4ba81" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:22:28.793097" + }, + "Test propr/propr calculating adjacency_matrix for rho - matrix": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.propr.tsv:md5,53865e6c2b05e022277df6dc1188c461" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:23:03.240604" + }, + "Test propr/propr while running clr+pcor.bshrink explicitly - fdr": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.fdr.tsv:md5,dc9e6f135064363c503e83105a9c9b69" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:20:08.230977" + }, + "Test propr/propr calculating adjacency_matrix for phs - matrix": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.propr.tsv:md5,67ba22131ebd747404a3037f922a77f5" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-05T11:23:43.441629" + } +} \ No newline at end of file diff --git a/modules/nf-core/propr/propr/tests/tags.yml b/modules/nf-core/propr/propr/tests/tags.yml new file mode 100644 index 00000000..539706d4 --- /dev/null +++ b/modules/nf-core/propr/propr/tests/tags.yml @@ -0,0 +1,2 @@ +propr/propr: + - "modules/nf-core/propr/propr/**" diff --git a/nextflow_coda.config b/nextflow_coda.config new file mode 100644 index 00000000..23d475f4 --- /dev/null +++ b/nextflow_coda.config @@ -0,0 +1,55 @@ +//parametros generales, para el usuario (que se pueden arrancar poniendo los flags como el command line) + +params.tools = "./assets/tools_samplesheet.csv" +params.outdir = "../results_pipeline" +params.validationSkipDuplicateCheck = true +params.publish_dir_mode = 'copy' +params.pathway = 'all' +//params.maxRetries = 0 +includeConfig 'conf/modules.config' // now it should refer to modules_coda.config + + + +profiles { + debug { + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + process.debug = true + cleanup = false + nextflow.enable.configProcessNamesValidation = true + } + crg { includeConfig "conf/crg.config" } +} + +// Function to ensure that resource requirements don't go beyond +// a maximum limit +def check_max(obj, type) { + if (type == 'memory') { + try { + if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) + return params.max_memory as nextflow.util.MemoryUnit + else + return obj + } catch (all) { + println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'time') { + try { + if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + return params.max_time as nextflow.util.Duration + else + return obj + } catch (all) { + println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'cpus') { + try { + return Math.min( obj, params.max_cpus as int ) + } catch (all) { + println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" + return obj + } + } +} diff --git a/nextflow_schema_coda.json b/nextflow_schema_coda.json new file mode 100644 index 00000000..cba300f4 --- /dev/null +++ b/nextflow_schema_coda.json @@ -0,0 +1,25 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "title": "nf-core/testpipeline pipeline parameters", + "description": "this is a test", + "type": "object", + "definitions": { + "tools_options": { + "title": "Tools options", + "type": "object", + "description": "Define where the pipeline should find input data", + "required": ["tools"], + "properties": { + "tools": { + "type": "string", + "format": "file-path", + "mimetype": "text/csv", + "schema": "assets/schema_tools.json", + "pattern": "^\\S+\\.(csv|tsv|yaml)$", + "description": "Path to comma-separated file containing samplesheet", + "help_text": "this is just a test" + } + } + } + } +} diff --git a/subworkflows/local/correlation/main.nf b/subworkflows/local/correlation/main.nf new file mode 100644 index 00000000..5bcaa808 --- /dev/null +++ b/subworkflows/local/correlation/main.nf @@ -0,0 +1,48 @@ + +// include nf-core modules +include {PROPR_PROPR as PROPR} from "../../modules/nf-core/propr/propr/main.nf" + + +workflow CORRELATION { + take: + ch_counts + ch_tools + ch_counts_filtered + + main: + ch_counts + .combine(ch_tools) + .map { + metacounts, counts, metatools -> + [ metacounts+metatools, counts ] + } + .branch { + propr: it[0]["cor_method"] == "propr" + } + .set { ch_counts_cor } + + // Hacer un branch del channel para coger las counts normales cuando no hay variable selection + + ch_counts_cor.propr + .branch{ + no_sel: it[0]["sel_method"] == null + sel: it[0]["sel_method"] != null + } + .set { ch_counts_selection } + + //ch_counts_selection.no_sel.view() + //ch_counts_filtered.view() + + ch_propr = ch_counts_filtered.mix(ch_counts_selection.no_sel) + //ch_propr.view() + + PROPR(ch_propr) + ch_matrix = PROPR.out.matrix + ch_adjacency = PROPR.out.adj + + + emit: + matrix = ch_matrix + adjacency = ch_adjacency + +} diff --git a/subworkflows/local/differential/main.nf b/subworkflows/local/differential/main.nf new file mode 100644 index 00000000..ccea968a --- /dev/null +++ b/subworkflows/local/differential/main.nf @@ -0,0 +1,33 @@ +// include modules +include {PROPR_PROPD as PROPD} from "../../modules/nf-core/propr/propd/main.nf" + + +workflow DIFFERENTIAL { + take: + ch_counts + ch_tools + ch_samplesheet + + main: + ch_counts + .combine(ch_tools) + .map { + metacounts, counts, meta -> + [ metacounts+meta, counts ] + } + //.view() + .branch { + propd: it[0]["diff_method"] == "propd" + deseq2: it[0]["diff_method"] == "deseq2" + } + .set { ch_counts_tools } + + PROPD(ch_counts_tools.propd, ch_samplesheet) + ch_results = PROPD.out.results + ch_adjacency = PROPD.out.adj + + emit: + results = ch_results + adjacency = ch_adjacency + +} diff --git a/subworkflows/local/enrichment/main.nf b/subworkflows/local/enrichment/main.nf new file mode 100644 index 00000000..b19f96fc --- /dev/null +++ b/subworkflows/local/enrichment/main.nf @@ -0,0 +1,46 @@ +// include modules + +include { PROPR_GREA as GREA_DIFF } from "../../modules/nf-core/propr/grea/main.nf" +include { PROPR_GREA as GREA_COR } from "../../modules/nf-core/propr/grea/main.nf" +include { MYGENE } from "../../modules/nf-core/mygene/main.nf" + + +workflow ENRICHMENT { + take: + ch_diff_adjacency + ch_cor_adjacency + ch_counts + + + main: + + MYGENE(ch_counts) + ch_gmt = MYGENE.out.gmt + + + ch_diff_adjacency + .branch { + grea: 
it[0]["enr_diff_method"] == "grea" + gsea: it[0]["enr_diff_method"] == "gsea" + } + .set { ch_diff_grea } + + GREA_DIFF(ch_diff_grea.grea, ch_gmt.collect()) + ch_enriched_diff = GREA_DIFF.out.enrichedGO + + ch_cor_adjacency + .branch { + grea: it[0]["enr_cor_method"] == "grea" } + .set { ch_cor_grea } + + ch_cor_grea.grea.view() + ch_diff_grea.grea.view() + + GREA_COR(ch_cor_grea.grea, ch_gmt.collect()) + ch_enriched_cor = GREA_COR.out.enrichedGO + + + emit: + enriched_diff = ch_enriched_diff + enriched_cor = ch_enriched_cor +} diff --git a/subworkflows/local/experimental/main.nf b/subworkflows/local/experimental/main.nf new file mode 100644 index 00000000..6e811027 --- /dev/null +++ b/subworkflows/local/experimental/main.nf @@ -0,0 +1,73 @@ +// include subworkflows + +include { CORRELATION } from '../correlation/main.nf' +include { DIFFERENTIAL } from '../differential/main.nf' +include { VARIABLE_SELECTION } from '../variable_selection/main.nf' +include { ENRICHMENT } from '../enrichment/main.nf' + + +workflow EXPERIMENTAL { + take: + ch_samples_and_matrix // [meta, samplesheet, matrix] que viene de differentialabundance + ch_tools + + + main: + // Dividir el ch_samples_and_matrix en un channel de samplesheet y otro de matrix (PROPD los coge por separado) + ch_samples_and_matrix + .map { + meta, samplesheet, counts -> + [ meta, samplesheet ] + } + .set { ch_samplesheet } + + ch_samples_and_matrix + .map { + meta, samplesheet, counts -> + [ meta, counts ] + } + .set { ch_counts } + // ch_counts.view() + + ch_counts + .combine(ch_tools) + .map { + metacounts, counts, metatools -> + [ metacounts+metatools, counts ] + } + .set { ch_counts_tools } + + // Perform CODA analysis + ch_out = Channel.empty() + + // Perform differential analysis + + DIFFERENTIAL(ch_counts, ch_tools, ch_samplesheet.collect()) + ch_diff_results = DIFFERENTIAL.out.results + ch_diff_adjacency = DIFFERENTIAL.out.adjacency + + // Perform variable selection + ch_counts_filtered = VARIABLE_SELECTION(ch_diff_adjacency, ch_counts) + + //VARIABLE_SELECTION.out.count.view() + + // Perform correlation analysis + CORRELATION(ch_counts, ch_tools, ch_counts_filtered) + ch_matrix = CORRELATION.out.matrix + ch_cor_adjacency = CORRELATION.out.adjacency + ch_out.mix(ch_matrix) + + //ch_diff_adjacency.view() + //ch_cor_adjacency.view() + + // Perform enrichment analysis + ENRICHMENT(ch_diff_adjacency, ch_cor_adjacency, ch_counts) + ch_enriched_cor = ENRICHMENT.out.enriched_cor + ch_enriched_diff = ENRICHMENT.out.enriched_diff + + ch_out.mix(ch_enriched_diff, ch_enriched_cor) + + + emit: + output = ch_out +} diff --git a/subworkflows/local/variable_selection/main.nf b/subworkflows/local/variable_selection/main.nf new file mode 100644 index 00000000..f914473c --- /dev/null +++ b/subworkflows/local/variable_selection/main.nf @@ -0,0 +1,40 @@ +// include modules +include { FILTERVAR } from "../../modules/local/filtervar/main.nf" + +workflow VARIABLE_SELECTION { + take: + ch_adj//meta_tools, adj + ch_counts //meta_id, counts + + + main: + ch_counts + .map { + metacounts, counts -> + [counts] + } + .combine(ch_adj) + //.view() + .map{ + counts, meta, adj -> + [ meta, counts, adj] + } + //.view() + .branch { + filtervar: it[0]["sel_method"] == "filtervar" + deseqfilter: it[0]["sel_method"] == "deseqfilter" + } + .set { ch_counts_adj_sel } + + //ch_counts_adj_sel.nofilter.view() + + + FILTERVAR(ch_counts_adj_sel.filtervar) + + ch_counts_cor = FILTERVAR.out.count + //ch_counts_cor.view() + + + emit: + count = ch_counts_cor +}