diff --git a/.lintr b/.lintr new file mode 100644 index 0000000..3d4eb22 --- /dev/null +++ b/.lintr @@ -0,0 +1,2 @@ +linters: linters_with_defaults() # see vignette("lintr") +encoding: "UTF-8" diff --git a/DESCRIPTION b/DESCRIPTION index e817b19..ac53331 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: GOeval Title: Gene Regulatory Network Evaluation Using Gene Ontology -Version: 0.0.0.9000 +Version: 1.0.0 Authors@R: person("Thomas", "Westbrook", , "westbrookt@wustl.edu", role = c("aut", "cre"), comment = c(ORCID = "YOUR-ORCID-ID")) diff --git a/R/evaluate.R b/R/evaluate.R index 571dba0..979366b 100644 --- a/R/evaluate.R +++ b/R/evaluate.R @@ -1,88 +1,158 @@ - #' Perform the whole GOeval pipeline on one network #' +#' @importFrom utils read.table +#' #' @description #' Given one network, `evaluate` will first make subsets with `subset_network`, #' then run Over-Representation Analysis (ORA) with `webgestalt_network`, -#' followed by `get_metrics` and `plot_metrics` to plot selected summary statistics. +#' followed by `get_metrics` and `plot_metrics` to plot selected summary +#' statistics. #' -#' @param network path to the file containing the network to create subsets from. The file should -#' be tab-separated with three columns: source node, target node, edge score -#' @param reference_set path to the set of all genes possibly included in the network. Must be a .txt -#' file containing exactly one column of the genes that could possibly appear in the network. -#' @param output_directory path to the directory in which to store the generated network subsets, -#' ORA summaries, and plots -#' @param network_name short name for the network - used in file naming so may not contain spaces +#' @param network path to the file containing the network to create subsets +#' from. The file should be tab-separated with three columns: source node, +#' target node, edge score +#' @param reference_set path to the set of all genes possibly included in the +#' network. Must be a .txt file containing exactly one column of the genes that +#' could possibly appear in the network. +#' @param output_directory path to the directory in which to store the generated +#' network subsets, ORA summaries, and plots +#' @param network_name short name for the network - used in file naming so may +#' not contain spaces #' @param organism a string specifying the organism that the data is from, e.g. #' "hsapiens" or "scerevisiae" - see options with WebGestaltR::listOrganism() -#' @param database the gene set database to search for enrichment - see options with WebGestaltR::listGeneSet(). -#' Must be a Gene Ontology "biological process" database if get_annotation_overlap = TRUE. -#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType() -#' and see webgestalt.org for examples of each type -#' @param edges list of total numbers of edges or average edges per TF to include in each subset -#' @param num_possible_TFs if set to a number > 0, the elements of 'edges' will first be -#' multiplied by this number to get the number of edges for each subset -#' @param permutations the number of randomly permuted networks to create and run ORA on -#' @param penalty the penalty applied to the 'sum' metric for each TF in the network -#' @param fdr_threshold the FDR threshold for a gene set term to be considered significantly -#' over-represented for the purposes of calculating the 'percent' metric -#' @param get_sum boolean whether to get and plot the 'sum' metric, which is the sum of the negative -#' log base 10 of the p-value for the top term of each source node minus 'penalty' times the total -#' number of source nodes. -#' @param get_percent boolean whether to get and plot the 'percent' metric, which is the -#' percent of source nodes with at least one term with a FDR below the 'fdr_threshold' -#' @param get_mean boolean whether to get and plot the 'mean' metric, which is the mean negative -#' log base 10 of the p-value for the top term of each source node regardless of significance -#' @param get_median boolean whether to get and plot the 'median' metric, which is the median negative -#' log base 10 of the p-value for the top term of each source node regardless of significance -#' @param get_annotation_overlap boolean whether to get and plot the 'annotation_overlap' metric, -#' which is the percent of source nodes that are annotated to at least one of the 16 GO terms for -#' which their target genes are most enriched -#' @param get_size boolean whether to get and plot the 'size' metric, which is the number of -#' source nodes in the network subset that have more than one target gene with annotations. This number is -#' used in the calculation of all other metrics. -#' @param plot boolean whether to make plots of the calculated metrics and write them to a pdf +#' @param database the gene set database to search for enrichment - see options +#' with WebGestaltR::listGeneSet(). Must be a Gene Ontology +#' "biological process" database if get_annotation_overlap = TRUE. +#' @param gene_id the naming system used for the input genes - see options with +#' WebGestaltR::listIdType() and see webgestalt.org for examples of each type +#' @param edges list of total numbers of edges or average edges per TF to +#' include in each subset +#' @param num_possible_TFs if set to a number > 0, the elements of 'edges' will +#' first be multiplied by this number to get the number of edges for each +#' subset +#' @param permutations the number of randomly permuted networks to create and +#' run ORA on +#' @param penalty the penalty applied to the 'sum' metric for each TF in the +#' network +#' @param fdr_threshold the FDR threshold for a gene set term to be considered +#' significantly over-represented for the purposes of calculating the 'percent' +#' metric +#' @param get_sum boolean whether to get and plot the 'sum' metric, which is the +#' sum of the negative log base 10 of the p-value for the top term of each +#' source node minus 'penalty' times the total number of source nodes. +#' @param get_percent boolean whether to get and plot the 'percent' metric, +#' which is the percent of source nodes with at least one term with a FDR below +#' the 'fdr_threshold' +#' @param get_mean boolean whether to get and plot the 'mean' metric, which is +#' the mean negative log base 10 of the p-value for the top term of each source +#' node regardless of significance +#' @param get_median boolean whether to get and plot the 'median' metric, which +#' is the median negative log base 10 of the p-value for the top term of each +#' source node regardless of significance +#' @param get_annotation_overlap boolean whether to get and plot the +#' 'annotation_overlap' metric, which is the percent of source nodes that are +#' annotated to at least one of the 16 GO terms for which their target genes +#' are most enriched +#' @param get_size boolean whether to get and plot the 'size' metric, which is +#' the number of source nodes in the network subset that have more than one +#' target gene with annotations. This number is used in the calculation of all +#' other metrics. +#' @param plot boolean whether to make plots of the calculated metrics and write +#' them to a pdf #' #' @details -#' The input file should be tab-separated with two or three columns: source node (e.g. transcription factor), -#' target node (e.g. the regulated gene), and, optionally, edge score. +#' The input file should be tab-separated with two or three columns: source node +#' (e.g. transcription factor), target node (e.g. the regulated gene), and, +#' optionally, edge score. #' #' @return output of `get_metrics`. Can be used as input to `plot_metrics`. #' #' @export -evaluate <- function(network, reference_set, output_directory, network_name, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", gene_id = "ensembl_gene_id", - edges = c(512, 1024, 2048, 4096, 8192, 16384, 32768, 65536), num_possible_TFs = 0, permutations = 3, penalty = 3, fdr_threshold = 0.05, - get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, plot = TRUE) { +evaluate <- function(network, reference_set, output_directory, network_name, + organism = "hsapiens", + database = "geneontology_Biological_Process_noRedundant", + gene_id = "ensembl_gene_id", + edges = c(512, 1024, 2048, 4096, 8192, 16384, 32768, 65536), + num_possible_TFs = 0, permutations = 3, penalty = 3, + fdr_threshold = 0.05, get_sum = TRUE, get_percent = FALSE, + get_mean = FALSE, get_median = FALSE, + get_annotation_overlap = FALSE, get_size = TRUE, + plot = TRUE) { + + # check existence and format of `reference_set` before first function call + if(!file.exists(reference_set)) { + stop("Reference set file not found.") + } + + ref_first_line <- tryCatch({ + read.table(reference_set, nrows = 1) + }, error = function(e) { + stop("Error reading the reference set file.") + }) + + num_columns <- ncol(ref_first_line) + if (num_columns != 1) { + stop("Reference set file should contain only one column.") + } # first package function - subset_network(network, file.path(output_directory, paste0(network_name, "_subsets")), network_name, edges, num_possible_TFs) + # existence and format of `network` and `network_name` checked here + subset_network(network, file.path(output_directory, + paste0(network_name, "_subsets")), + network_name, edges, num_possible_TFs) # for unique names to avoid overwriting formatted_time <- format(Sys.time(), "%Y-%m-%d-%H%M") summaries_suffix <- paste0("_summaries_", formatted_time) # sequential loop over all created subsets - for (subset in list.files(file.path(output_directory, paste0(network_name, "_subsets")), full.names = TRUE)) { - # second package function - webgestalt_network(network_path = subset, - reference_set = reference_set, - output_directory = file.path(output_directory, paste0(network_name, summaries_suffix)), - # this just gets the name of each file minus the extension - network_name = strsplit(basename(subset), "[.]")[[1]][1], - organism = organism, - database = database, - gene_id = gene_id, - permutations = permutations) + for (subset in list.files(file.path(output_directory, paste0(network_name, + "_subsets")), + full.names = TRUE)) { + # second package function + webgestalt_network( + network_path = subset, + reference_set = reference_set, + output_directory = file.path(output_directory, paste0(network_name, + summaries_suffix)), + # this just gets the name of each file minus the extension + network_name = strsplit(basename(subset), "[.]")[[1]][1], + organism = organism, + database = database, + gene_id = gene_id, + permutations = permutations + ) } # third package function # mapply returns a list of length 1 containing the output of get_metrics - metric_dfs_by_net <- mapply(get_metrics, file.path(output_directory, paste0(network_name, summaries_suffix)), MoreArgs=list(organism = organism, database = database, gene_id = gene_id, get_sum = get_sum, get_percent = get_percent, get_mean = get_mean, get_median = get_median, get_annotation_overlap = get_annotation_overlap, get_size = get_size, penalty = penalty, fdr_threshold = fdr_threshold, parallel = FALSE), SIMPLIFY = FALSE) + metric_dfs_by_net <- mapply(get_metrics, file.path(output_directory, + paste0(network_name, + summaries_suffix)), + MoreArgs = list(organism = organism, + database = database, + gene_id = gene_id, + get_sum = get_sum, + get_percent = get_percent, + get_mean = get_mean, + get_median = get_median, + get_annotation_overlap = get_annotation_overlap, + get_size = get_size, + penalty = penalty, + fdr_threshold = fdr_threshold, + parallel = FALSE), + SIMPLIFY = FALSE) if (plot) { - pdf(file.path(output_directory, paste0(network_name, "_ORA_metrics_plots_", formatted_time, ".pdf")), 7, 5) + pdf(file.path(output_directory, paste0(network_name, "_ORA_metrics_plots_", + formatted_time, ".pdf")), 7, 5) # fourth package function - plot_data <- plot_metrics(metric_dfs_by_net, network_name, "", perTF = (num_possible_TFs > 0), sum = get_sum, percent = get_percent, mean = get_mean, median = get_median, annotation_overlap = get_annotation_overlap, size = get_size) + plot_data <- plot_metrics(metric_dfs_by_net, network_name, "", + perTF = (num_possible_TFs > 0), sum = get_sum, + percent = get_percent, mean = get_mean, + median = get_median, + annotation_overlap = get_annotation_overlap, + size = get_size) dev.off() } diff --git a/R/get_metrics.R b/R/get_metrics.R index 8f0ab93..cb9fdd1 100644 --- a/R/get_metrics.R +++ b/R/get_metrics.R @@ -100,7 +100,7 @@ get_network_metrics <- function(full_terms, network_size, organism, gene_id, get neglogp <- -log10(terms$pValue) # cap -logp values due to rounding to 0 for pval < 1E-16 neglogp <- ifelse(is.finite(neglogp), neglogp, 16) - #neglogp_zeros <- append(rep(0, network_size - length(terms$geneSet)), neglogp) + # neglogp_zeros <- append(rep(0, network_size - length(terms$geneSet)), neglogp) if (get_annotation_overlap) { prior_ann <- 100 * length(get_annotation_overlap(full_terms, organism, gene_id, go_ann = go_ann, go_reg = go_reg)) / network_size } else { @@ -167,13 +167,18 @@ get_network_metrics <- function(full_terms, network_size, organism, gene_id, get #' #' @export get_metrics <- function(directory, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", gene_id = "ensembl_gene_id", get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, penalty = 3, fdr_threshold = 0.05, parallel = FALSE) { + + if (!dir.exists(directory)) { + stop("Directory not found.") + } + # GO regulatory relationships only needed if get_annotation_overlap = TRUE if (get_annotation_overlap) { # Load regulatory relationships between GO terms for the calculation of overlap between TF GO # annotations and their targets' enriched GO terms. go <- as.data.frame(ontologyIndex::get_ontology(file = "http://purl.obolibrary.org/obo/go/go-basic.obo", extract_tags = "everything")) - go <- go[go$namespace == 'biological_process',] - go_reg <- go[c('id', 'regulates', 'negatively_regulates', 'positively_regulates')] + go <- go[go$namespace == "biological_process", ] + go_reg <- go[c("id", "regulates", "negatively_regulates", "positively_regulates")] rm(go) # Load gene annotations with WebGestaltR::loadGeneSet. @@ -200,7 +205,7 @@ get_metrics <- function(directory, organism = "hsapiens", database = "geneontolo # but also sometimes it works, so possible race condition although I'm not sure how results <- parallel::mclapply(networks_list, function(net) { network_data_files <- list.files(net, recursive = TRUE, pattern = "network_data.txt", full.names = TRUE) - network_sizes <- lapply(network_data_files, function (f) { + network_sizes <- lapply(network_data_files, function(f) { size_line <- grep("valid", readLines(f), value = TRUE) return(as.integer(unlist(strsplit(size_line, " "))[1])) }) @@ -212,7 +217,7 @@ get_metrics <- function(directory, organism = "hsapiens", database = "geneontolo # non-parallel version results <- lapply(networks_list, function(net) { network_data_files <- list.files(net, recursive = TRUE, pattern = "network_data.txt", full.names = TRUE) - network_sizes <- lapply(network_data_files, function (f) { + network_sizes <- lapply(network_data_files, function(f) { size_line <- grep("valid", readLines(f), value = TRUE) return(as.integer(unlist(strsplit(size_line, " "))[1])) }) diff --git a/R/plot_metrics.R b/R/plot_metrics.R index 4e717f1..efb7ea0 100644 --- a/R/plot_metrics.R +++ b/R/plot_metrics.R @@ -38,16 +38,19 @@ #' #' @export plot_metrics <- function(metric_dfs_by_net, title_text, subtitle_text = "", perTF, sum = TRUE, percent = FALSE, mean = FALSE, median = FALSE, annotation_overlap = FALSE, size = TRUE) { + + ### Should check format of `metric_dfs_by_net` + # determine if any of the input networks have only one size # if yes, metrics for permuted networks will be shown with box plots instead of line plots plot_with_boxes <- FALSE if (inherits(metric_dfs_by_net[[1]], "list")) { for (df_list in metric_dfs_by_net) { - if (length(df_list[[1]][1,]) == 1) { + if (length(df_list[[1]][1, ]) == 1) { plot_with_boxes <- TRUE } } - } else if (length(metric_dfs_by_net[[1]][1,]) == 1) { + } else if (length(metric_dfs_by_net[[1]][1, ]) == 1) { plot_with_boxes <- TRUE metric_dfs_by_net <- list(metric_dfs_by_net) } else { @@ -62,9 +65,11 @@ plot_metrics <- function(metric_dfs_by_net, title_text, subtitle_text = "", perT # assemble y-axis labels to_plot <- c(sum, percent, mean, median, annotation_overlap, size) - label_text <- c("Sum of (top -log(p-value) - 3) across TFs", "Percent of TFs with a significant GO term (FDR < 0.05)", - "Mean of top -log(p-value)", "Median of top -log(p-value)", "Percent of TFs annotated to a significant GO term (FDR < 0.05)", - "Number of TFs with > 1 annotated target gene")[to_plot] + label_text <- c( + "Sum of (top -log(p-value) - 3) across TFs", "Percent of TFs with a significant GO term (FDR < 0.05)", + "Mean of top -log(p-value)", "Median of top -log(p-value)", "Percent of TFs annotated to a significant GO term (FDR < 0.05)", + "Number of TFs with > 1 annotated target gene" + )[to_plot] # this won't catch when correct amount but wrong selection of metrics if (length(label_text) != length(metric_dfs)) { @@ -73,35 +78,39 @@ plot_metrics <- function(metric_dfs_by_net, title_text, subtitle_text = "", perT for (m in 1:length(metric_dfs)) { # gather dfs into format usable by ggplot2 - gathered = tidyr::gather(metric_dfs[[m]], network, metric) + gathered <- tidyr::gather(metric_dfs[[m]], network, metric) gathered <- tidyr::separate_wider_regex(gathered, network, c(method = ".*", "_", size = ".*"), cols_remove = FALSE) - gathered$network = factor(gathered$network, levels=unique(gathered$network)) + gathered$network <- factor(gathered$network, levels = unique(gathered$network)) # split metrics based on whether they came from real or permuted data - real = gathered[!duplicated(gathered$network), ] + real <- gathered[!duplicated(gathered$network), ] real$size <- as.numeric(real$size) real$ltype <- "Original network" - permuted = gathered[duplicated(gathered$network), ] + permuted <- gathered[duplicated(gathered$network), ] permuted$size <- as.numeric(permuted$size) if (plot_with_boxes) { - plt <- ggplot2::ggplot(real, mapping=ggplot2::aes(x = size, y = metric, color = method, shape = method)) + plt <- ggplot2::ggplot(real, mapping = ggplot2::aes(x = size, y = metric, color = method, shape = method)) } else { - plt <- ggplot2::ggplot(real, mapping=ggplot2::aes(x = size, y = metric, color = method, shape = method, linetype = ltype)) + plt <- ggplot2::ggplot(real, mapping = ggplot2::aes(x = size, y = metric, color = method, shape = method, linetype = ltype)) } plt <- plt + ggplot2::geom_line(linewidth = 1) + ggplot2::geom_point(size = 2.5) + ggplot2::ggtitle(title_text, subtitle_text) + - ggplot2::guides(linetype = ggplot2::guide_legend(override.aes = list(size = 0.5)), - color = ggplot2::guide_legend(override.aes = list(size = 0.5))) + - ggplot2::theme(axis.line = ggplot2::element_line(colour = "black"), - panel.background = ggplot2::element_blank(), - plot.title=ggplot2::element_text(hjust=0.5), - plot.subtitle=ggplot2::element_text(hjust=0.5)) + + ggplot2::guides( + linetype = ggplot2::guide_legend(override.aes = list(size = 0.5)), + color = ggplot2::guide_legend(override.aes = list(size = 0.5)) + ) + + ggplot2::theme( + axis.line = ggplot2::element_line(colour = "black"), + panel.background = ggplot2::element_blank(), + plot.title = ggplot2::element_text(hjust = 0.5), + plot.subtitle = ggplot2::element_text(hjust = 0.5) + ) + ggplot2::scale_x_continuous(n.breaks = 10, trans = "log2") - # set lower limit of y axis to 0 if graphing a percent + # set lower limit of y axis to 0 if graphing a percent if (grepl("Percent", label_text[m])) { plt <- plt + ggplot2::scale_y_continuous(limits = c(0, NA)) } @@ -115,16 +124,16 @@ plot_metrics <- function(metric_dfs_by_net, title_text, subtitle_text = "", perT # if any of the networks whose metrics are being plotted has only one size, # plotting with box plots for the permuted networks looks better if (plot_with_boxes) { - plt <- plt + ggplot2::geom_boxplot(permuted, mapping=ggplot2::aes(x=size, y=metric, color = method, group = interaction(method, size))) + + plt <- plt + ggplot2::geom_boxplot(permuted, mapping = ggplot2::aes(x = size, y = metric, color = method, group = interaction(method, size))) + ggplot2::labs(x = ifelse(perTF, "Edges per TF", "Edges"), y = label_text[m], color = "Network", shape = "Network") } else { - medians = dplyr::summarise(group_by(permuted, network), median = median(metric)) + medians <- dplyr::summarise(group_by(permuted, network), median = median(metric)) medians <- tidyr::separate_wider_regex(medians, network, c(method = ".*", "_", size = ".*"), cols_remove = FALSE) medians$size <- as.numeric(medians$size) medians$ltype <- "Randomized network" - for(net in unique(permuted$method)) { - plt <- plt + ggplot2::geom_line(medians[medians$method == net,], mapping = ggplot2::aes(x = size, y = median, color = method, linetype = ltype), linewidth = 1.5) + for (net in unique(permuted$method)) { + plt <- plt + ggplot2::geom_line(medians[medians$method == net, ], mapping = ggplot2::aes(x = size, y = median, color = method, linetype = ltype), linewidth = 1.5) } plt <- plt + ggplot2::labs(x = ifelse(perTF, "Edges per TF", "Edges"), y = label_text[m], color = "Network", shape = "Network", linetype = "Data") } diff --git a/R/subset_network.R b/R/subset_network.R index 1836fb4..ffd9338 100644 --- a/R/subset_network.R +++ b/R/subset_network.R @@ -30,20 +30,41 @@ #' #' @export subset_network <- function(input_file, output_directory, name, edges, num_possible_TFs = 0) { + + # check existence and format of input_file + if(!file.exists(input_file)) { + stop("Network file not found.") + } + + net_first_line <- tryCatch({ + read.table(input_file, sep = "\t", nrows = 1) + }, error = function(e) { + stop("Error reading the network file.") + }) + + net_num_columns <- ncol(net_first_line) + if (net_num_columns != 2 && net_num_columns != 3) { + stop("Network file should contain three columns.") + } + + if(!is.character(name) || grepl(" ", name)) { + stop("Network name must be a string with no spaces.") + } + dir.create(output_directory, showWarnings = FALSE, recursive = TRUE) - network = read.table(file=input_file, sep='\t', header=FALSE) + network <- read.table(file = input_file, sep = "\t", header = FALSE) if (num_possible_TFs > 0) { - multiplier = num_possible_TFs + multiplier <- num_possible_TFs } else { - multiplier = 1 + multiplier <- 1 } # check that there is a third column # if not, don't subset if (length(colnames(network)) < 3) { - write.table(network, file.path(output_directory, paste0(name, "_", as.integer(length(network$V1)/multiplier), ".tsv")), quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE) + write.table(network, file.path(output_directory, paste0(name, "_", as.integer(length(network$V1) / multiplier), ".tsv")), quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE) return() } diff --git a/R/webgestalt_network.R b/R/webgestalt_network.R index 72f456a..582500a 100644 --- a/R/webgestalt_network.R +++ b/R/webgestalt_network.R @@ -41,96 +41,148 @@ #' #' @export webgestalt_network <- function(network_path, reference_set, output_directory, network_name, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", gene_id = "ensembl_gene_id", permutations = 10) { - METHOD="ORA" # ORA | GSEA | NTA + + # check existence and format of `network_path` + if(!file.exists(network_path)) { + stop("Network file not found.") + } + + net_first_line <- tryCatch({ + read.table(network_path, sep = "\t", nrows = 1) + }, error = function(e) { + stop("Error reading the network file.") + }) + + net_num_columns <- ncol(net_first_line) + if (net_num_columns != 2) { + stop("Network file should contain two columns.") + } + + # check existence and format of `reference_set` + if(!file.exists(reference_set)) { + stop("Reference set file not found.") + } + + ref_first_line <- tryCatch({ + read.table(reference_set, nrows = 1) + }, error = function(e) { + stop("Error reading the reference set file.") + }) + + ref_num_columns <- ncol(ref_first_line) + if (ref_num_columns != 1) { + stop("Reference set file should contain only one column.") + } + + # check format of `network_name` + if(!is.character(network_name) || grepl(" ", network_name)) { + stop("Network name must be a string with no spaces.") + } + + METHOD <- "ORA" # ORA | GSEA | NTA # reports are more in-depth than summaries - advisable to keep reports FALSE if not needed - REPORTS_PATH=output_directory # only used if GENERATE_REPORT=TRUE - GENERATE_REPORT=FALSE + REPORTS_PATH <- output_directory # only used if GENERATE_REPORT=TRUE + GENERATE_REPORT <- FALSE # path must exist even if GENERATE_REPORT=FALSE - dir.create(REPORTS_PATH, showWarnings = FALSE, recursive=TRUE) + dir.create(REPORTS_PATH, showWarnings = FALSE, recursive = TRUE) - for (i in 1:(permutations+1)) { - dir.create(file.path(output_directory, network_name, paste0("p",i-1)), showWarnings = FALSE, recursive=TRUE) + for (i in 1:(permutations + 1)) { + dir.create(file.path(output_directory, network_name, paste0("p", i - 1)), showWarnings = FALSE, recursive = TRUE) } if (file.exists(network_path)) { # create paths for the intermediary gene set files - work_dir = file.path(output_directory, "webgestalt_work", network_name) - for (i in 1:(permutations+1)) { - dir.create(file.path(work_dir, paste0("p",i-1)), showWarnings = FALSE, recursive=TRUE) + work_dir <- file.path(output_directory, "webgestalt_work", network_name) + for (i in 1:(permutations + 1)) { + dir.create(file.path(work_dir, paste0("p", i - 1)), showWarnings = FALSE, recursive = TRUE) } # finds the subset of genes in the reference set that are annotated to valid GO terms - ref_genes = WebGestaltR::idMapping(organism = organism, inputGene = read.table(reference_set)$V1, sourceIdType = gene_id, targetIdType = "entrezgene", host = "https://www.webgestalt.org/") - annotations = suppressWarnings(WebGestaltR::loadGeneSet(organism = organism, enrichDatabase = database)) - genes_per_term = table(annotations$geneSet$geneSet) - annotations_proper_size = annotations$geneSet[(genes_per_term[annotations$geneSet$geneSet] >= 10 & genes_per_term[annotations$geneSet$geneSet] <= 500),] + ref_genes <- WebGestaltR::idMapping(organism = organism, inputGene = read.table(reference_set)$V1, sourceIdType = gene_id, targetIdType = "entrezgene") + annotations <- suppressWarnings(WebGestaltR::loadGeneSet(organism = organism, enrichDatabase = database)) + genes_per_term <- table(annotations$geneSet$geneSet) + annotations_proper_size <- annotations$geneSet[(genes_per_term[annotations$geneSet$geneSet] >= 10 & genes_per_term[annotations$geneSet$geneSet] <= 500), ] rm(annotations) - annotated_ref_genes = ref_genes$mapped[ref_genes$mapped$entrezgene %in% annotations_proper_size$gene,] + annotated_ref_genes <- ref_genes$mapped[ref_genes$mapped$entrezgene %in% annotations_proper_size$gene, ] - network = read.table(file=network_path, sep='\t', header=FALSE) - annotated_network = network[(network$V1 %in% annotated_ref_genes$userId & network$V2 %in% annotated_ref_genes$userId),] + network <- read.table(file = network_path, sep = "\t", header = FALSE) + annotated_network <- network[(network$V1 %in% annotated_ref_genes$userId & network$V2 %in% annotated_ref_genes$userId), ] # returns a network with permuted TF and gene labels # conserves edge number distributions permute <- function(net_mat) { permuted_mat <- net_mat - rownames(permuted_mat) = sample(rownames(permuted_mat)) - colnames(permuted_mat) = sample(colnames(permuted_mat)) - permuted <- data.frame(V1=rep(row.names(permuted_mat), ncol(permuted_mat)), V2=rep(colnames(permuted_mat), each=nrow(permuted_mat)), V3=as.numeric(unlist(permuted_mat))) + rownames(permuted_mat) <- sample(rownames(permuted_mat)) + colnames(permuted_mat) <- sample(colnames(permuted_mat)) + permuted <- data.frame(V1 = rep(row.names(permuted_mat), ncol(permuted_mat)), V2 = rep(colnames(permuted_mat), each = nrow(permuted_mat)), V3 = as.numeric(unlist(permuted_mat))) return(permuted[permuted$V3 != 0, 1:2]) } unique_TFs <- unique(annotated_network$V1) - ann_network_matrix = xtabs(~ V1 + V2, annotated_network) + ann_network_matrix <- xtabs(~ V1 + V2, annotated_network) # separates each network subset into gene set files, calls the WebGestaltRBatch # function on that folder, and saves the results - for (i in 1:(permutations+1)) { + for (i in 1:(permutations + 1)) { if (i == 1) { net <- annotated_network } else { net <- permute(ann_network_matrix) } - valid_sets_counter = 0 + valid_sets_counter <- 0 for (tf in unique_TFs) { - gene_set = net$V2[net$V1 == tf] - entrez_gene_set = annotated_ref_genes$entrezgene[annotated_ref_genes$userId %in% gene_set] - present_terms = annotations_proper_size$geneSet[annotations_proper_size$gene %in% entrez_gene_set] + gene_set <- net$V2[net$V1 == tf] + entrez_gene_set <- annotated_ref_genes$entrezgene[annotated_ref_genes$userId %in% gene_set] + present_terms <- annotations_proper_size$geneSet[annotations_proper_size$gene %in% entrez_gene_set] if (length(gene_set) > 1 & length(unique(present_terms)) > 1) { - write.table(gene_set, file=file.path(work_dir,paste0("p",i-1),paste0(tf,"_targets.txt")), row.names=FALSE, col.names=FALSE) - valid_sets_counter = valid_sets_counter + 1 + write.table(gene_set, file = file.path(work_dir, paste0("p", i - 1), paste0(tf, "_targets.txt")), row.names = FALSE, col.names = FALSE) + valid_sets_counter <- valid_sets_counter + 1 } } # perform ORA - enrich_df <- WebGestaltR::WebGestaltRBatch( - enrichMethod = METHOD, - organism = organism, - enrichDatabase = database, - interestGeneFolder = file.path(work_dir,paste0("p",i-1)), - interestGeneType = gene_id, - referenceGene = annotated_ref_genes$userId, - referenceGeneType = gene_id, - minNum = 10, # default 10 - maxNum = 500, # default 500 - reportNum = 20, # default 20 - isOutput = GENERATE_REPORT, - outputDirectory = REPORTS_PATH, - projectName = paste0(network_name,"_p",i-1), - sigMethod = "top", - topThr = 16, # top 0.1% - isParallel = TRUE, - nThreads = ifelse(nzchar(Sys.getenv("_R_CHECK_LIMIT_CORES_", "")) && Sys.getenv("_R_CHECK_LIMIT_CORES_", "") == "TRUE", 2, parallelly::availableCores()), - hostName = "https://www.webgestalt.org/" + enrich_df <- tryCatch( + { + WebGestaltR::WebGestaltRBatch( + enrichMethod = METHOD, + organism = organism, + enrichDatabase = database, + interestGeneFolder = file.path(work_dir, paste0("p", i - 1)), + interestGeneType = gene_id, + referenceGene = annotated_ref_genes$userId, + referenceGeneType = gene_id, + minNum = 10, # default 10 + maxNum = 500, # default 500 + reportNum = 20, # default 20 + isOutput = GENERATE_REPORT, + outputDirectory = REPORTS_PATH, + projectName = paste0(network_name, "_p", i - 1), + sigMethod = "top", + topThr = 16, # roughly top 0.1% + isParallel = TRUE, + nThreads = ifelse(nzchar(Sys.getenv("_R_CHECK_LIMIT_CORES_", "")) && Sys.getenv("_R_CHECK_LIMIT_CORES_", "") == "TRUE", 2, parallelly::availableCores()) + ) + }, + error = function(msg) { + message(paste0(msg)) + message(paste("If the connection with www.webgestalt.org timed out,", + "try re-submitting your request. Their servers", + "sometimes get busy.")) + return(NA) + } ) - sig_counter = 0 + if (any(is.na(enrich_df))) { + return() + } + + sig_counter <- 0 for (set in enrich_df) { # get name from input - file_name = unlist(strsplit(set[[1]], '/'))[length(unlist(strsplit(set[[1]], '/')))] - no_ext = unlist(strsplit(file_name, '[.]'))[1] - tf_method = paste0(unlist(strsplit(no_ext, '_'))[1], '_', METHOD) + file_name <- unlist(strsplit(set[[1]], "/"))[length(unlist(strsplit(set[[1]], "/")))] + no_ext <- unlist(strsplit(file_name, "[.]"))[1] + tf_method <- paste0(unlist(strsplit(no_ext, "_"))[1], "_", METHOD) # save summary as .csv files if (!is.null(set[[2]])) { @@ -138,17 +190,18 @@ webgestalt_network <- function(network_path, reference_set, output_directory, ne sig_df <- subset(df, select = -c(link)) if (nrow(sig_df) > 0) { if (sig_df$FDR[1] < 0.05) { - sig_counter = sig_counter + 1 + sig_counter <- sig_counter + 1 } - sig_df['database'] <- rep(database, nrow(sig_df)) - write.csv(sig_df,file.path(output_directory,network_name,paste0("p",i-1),paste0(tf_method, "_summary.csv")),row.names = FALSE) + sig_df["database"] <- rep(database, nrow(sig_df)) + write.csv(sig_df, file.path(output_directory, network_name, paste0("p", i - 1), paste0(tf_method, "_summary.csv")), row.names = FALSE) } } } - write(paste0(sig_counter," significant\n",valid_sets_counter," valid\n",length(unique(network$V1))," total\n",length(network$V2)," edges"), - file = file.path(output_directory,network_name,paste0("p",i-1),"network_data.txt")) + write(paste0(sig_counter, " significant\n", valid_sets_counter, " valid\n", length(unique(network$V1)), " total\n", length(network$V2), " edges"), + file = file.path(output_directory, network_name, paste0("p", i - 1), "network_data.txt") + ) } } else { - print(paste(network_path,"must be an existing file.")) + message(paste(network_path, "must be an existing file.")) } } diff --git a/index.md b/index.md index 0675ec1..431699b 100644 --- a/index.md +++ b/index.md @@ -7,6 +7,6 @@ Welcome to GOeval - a package for gene regulatory network evaluation using Gene This package provides functions to systematically run over-representation analysis (ORA) on multiple subsets of a gene regulatory network and its permutations using the 'WebGestaltR' package. It also provides functions to assess the quality of biological information captured by the network via metrics calculated from the ORA results. -Here, you will find documentation on the purposes of the package's functions and their arguments, as well as a "Get started" vignette showing example usage of the package's functions. Navigate to the vignette by clicking either the "Get started" tab at the top of this page or https://westbrooktm.github.io/GOeval/articles/GOeval.html. +Here, you will find [Reference](https://westbrooktm.github.io/GOeval/reference/index.html) documentation on the purposes of the package's functions and their arguments, as well as a [Get started](https://westbrooktm.github.io/GOeval/articles/GOeval.html) vignette showing example usage of the package's functions. Input format: You will need a network file in which each row represents a directed edge. The tab-separated entries in each row are the the source node (e.g. transcription factor), the target node (e.g. the regulated gene), and optionally an edge score. The subset_network function requires edge scores. The webgestalt_network function requires a “gene universe” file that has a single column containing all the genes that could possibly be present in the network. diff --git a/data/vignette_data.tar.gz b/inst/vignette_data.tar.gz similarity index 100% rename from data/vignette_data.tar.gz rename to inst/vignette_data.tar.gz diff --git a/man/evaluate.Rd b/man/evaluate.Rd index 523b93e..7343d87 100644 --- a/man/evaluate.Rd +++ b/man/evaluate.Rd @@ -27,60 +27,75 @@ evaluate( ) } \arguments{ -\item{network}{path to the file containing the network to create subsets from. The file should -be tab-separated with three columns: source node, target node, edge score} +\item{network}{path to the file containing the network to create subsets +from. The file should be tab-separated with three columns: source node, +target node, edge score} -\item{reference_set}{path to the set of all genes possibly included in the network. Must be a .txt -file containing exactly one column of the genes that could possibly appear in the network.} +\item{reference_set}{path to the set of all genes possibly included in the +network. Must be a .txt file containing exactly one column of the genes that +could possibly appear in the network.} -\item{output_directory}{path to the directory in which to store the generated network subsets, -ORA summaries, and plots} +\item{output_directory}{path to the directory in which to store the generated +network subsets, ORA summaries, and plots} -\item{network_name}{short name for the network - used in file naming so may not contain spaces} +\item{network_name}{short name for the network - used in file naming so may +not contain spaces} \item{organism}{a string specifying the organism that the data is from, e.g. "hsapiens" or "scerevisiae" - see options with WebGestaltR::listOrganism()} -\item{database}{the gene set database to search for enrichment - see options with WebGestaltR::listGeneSet(). -Must be a Gene Ontology "biological process" database if get_annotation_overlap = TRUE.} +\item{database}{the gene set database to search for enrichment - see options +with WebGestaltR::listGeneSet(). Must be a Gene Ontology +"biological process" database if get_annotation_overlap = TRUE.} -\item{gene_id}{the naming system used for the input genes - see options with WebGestaltR::listIdType() -and see webgestalt.org for examples of each type} +\item{gene_id}{the naming system used for the input genes - see options with +WebGestaltR::listIdType() and see webgestalt.org for examples of each type} -\item{edges}{list of total numbers of edges or average edges per TF to include in each subset} +\item{edges}{list of total numbers of edges or average edges per TF to +include in each subset} -\item{num_possible_TFs}{if set to a number > 0, the elements of 'edges' will first be -multiplied by this number to get the number of edges for each subset} +\item{num_possible_TFs}{if set to a number > 0, the elements of 'edges' will +first be multiplied by this number to get the number of edges for each +subset} -\item{permutations}{the number of randomly permuted networks to create and run ORA on} +\item{permutations}{the number of randomly permuted networks to create and +run ORA on} -\item{penalty}{the penalty applied to the 'sum' metric for each TF in the network} +\item{penalty}{the penalty applied to the 'sum' metric for each TF in the +network} -\item{fdr_threshold}{the FDR threshold for a gene set term to be considered significantly -over-represented for the purposes of calculating the 'percent' metric} +\item{fdr_threshold}{the FDR threshold for a gene set term to be considered +significantly over-represented for the purposes of calculating the 'percent' +metric} -\item{get_sum}{boolean whether to get and plot the 'sum' metric, which is the sum of the negative -log base 10 of the p-value for the top term of each source node minus 'penalty' times the total -number of source nodes.} +\item{get_sum}{boolean whether to get and plot the 'sum' metric, which is the +sum of the negative log base 10 of the p-value for the top term of each +source node minus 'penalty' times the total number of source nodes.} -\item{get_percent}{boolean whether to get and plot the 'percent' metric, which is the -percent of source nodes with at least one term with a FDR below the 'fdr_threshold'} +\item{get_percent}{boolean whether to get and plot the 'percent' metric, +which is the percent of source nodes with at least one term with a FDR below +the 'fdr_threshold'} -\item{get_mean}{boolean whether to get and plot the 'mean' metric, which is the mean negative -log base 10 of the p-value for the top term of each source node regardless of significance} +\item{get_mean}{boolean whether to get and plot the 'mean' metric, which is +the mean negative log base 10 of the p-value for the top term of each source +node regardless of significance} -\item{get_median}{boolean whether to get and plot the 'median' metric, which is the median negative -log base 10 of the p-value for the top term of each source node regardless of significance} +\item{get_median}{boolean whether to get and plot the 'median' metric, which +is the median negative log base 10 of the p-value for the top term of each +source node regardless of significance} -\item{get_annotation_overlap}{boolean whether to get and plot the 'annotation_overlap' metric, -which is the percent of source nodes that are annotated to at least one of the 16 GO terms for -which their target genes are most enriched} +\item{get_annotation_overlap}{boolean whether to get and plot the +'annotation_overlap' metric, which is the percent of source nodes that are +annotated to at least one of the 16 GO terms for which their target genes +are most enriched} -\item{get_size}{boolean whether to get and plot the 'size' metric, which is the number of -source nodes in the network subset that have more than one target gene with annotations. This number is -used in the calculation of all other metrics.} +\item{get_size}{boolean whether to get and plot the 'size' metric, which is +the number of source nodes in the network subset that have more than one +target gene with annotations. This number is used in the calculation of all +other metrics.} -\item{plot}{boolean whether to make plots of the calculated metrics and write them to a pdf} +\item{plot}{boolean whether to make plots of the calculated metrics and write +them to a pdf} } \value{ output of \code{get_metrics}. Can be used as input to \code{plot_metrics}. @@ -88,9 +103,11 @@ output of \code{get_metrics}. Can be used as input to \code{plot_metrics}. \description{ Given one network, \code{evaluate} will first make subsets with \code{subset_network}, then run Over-Representation Analysis (ORA) with \code{webgestalt_network}, -followed by \code{get_metrics} and \code{plot_metrics} to plot selected summary statistics. +followed by \code{get_metrics} and \code{plot_metrics} to plot selected summary +statistics. } \details{ -The input file should be tab-separated with two or three columns: source node (e.g. transcription factor), -target node (e.g. the regulated gene), and, optionally, edge score. +The input file should be tab-separated with two or three columns: source node +(e.g. transcription factor), target node (e.g. the regulated gene), and, +optionally, edge score. } diff --git a/tests/testthat/test-evaluate.R b/tests/testthat/test-evaluate.R index ae76f90..5ec32df 100644 --- a/tests/testthat/test-evaluate.R +++ b/tests/testthat/test-evaluate.R @@ -1,5 +1,7 @@ test_that("evaluate runs on 10,000 edge network", { evaluate(file.path("data", "marbach_extract.tsv"), file.path("data", "marbach_NP_subset_universe.txt"), "data", - "mar", permutations = 1, get_annotation_overlap = TRUE) + "mar", + permutations = 1, get_annotation_overlap = TRUE + ) expect_equal(2 * 2, 4) }) diff --git a/tests/testthat/test-plot_metrics.R b/tests/testthat/test-plot_metrics.R index bd827dc..3b52a2e 100644 --- a/tests/testthat/test-plot_metrics.R +++ b/tests/testthat/test-plot_metrics.R @@ -1,11 +1,11 @@ test_that("get_metrics and plot_metrics run", { folders <- c(file.path("data", "np3_dtoTFs_summaries")) - title_text = "NP3" - subtitle_text = "subset for overlapping TFs with EDN" + title_text <- "NP3" + subtitle_text <- "subset for overlapping TFs with EDN" # call get_metrics - metric_dfs_by_net <- mapply(get_metrics, folders, MoreArgs=list(get_percent = TRUE, get_mean = TRUE, get_median = TRUE, get_annotation_overlap = TRUE, parallel = FALSE), SIMPLIFY = FALSE) + metric_dfs_by_net <- mapply(get_metrics, folders, MoreArgs = list(get_percent = TRUE, get_mean = TRUE, get_median = TRUE, get_annotation_overlap = TRUE, parallel = FALSE), SIMPLIFY = FALSE) expect_equal(length(metric_dfs_by_net[[1]]), 6) @@ -22,11 +22,11 @@ test_that("get_metrics and plot_metrics run", { test_that("get_metrics and plot_metrics run with single subset network", { folders <- c(file.path("data", "np3_dtoTFs_summaries"), file.path("data", "EDN_summaries")) - title_text = "NP3 vs EDN" - subtitle_text = "subset for overlapping TFs" + title_text <- "NP3 vs EDN" + subtitle_text <- "subset for overlapping TFs" # call get_metrics - metric_dfs_by_net <- mapply(get_metrics, folders, MoreArgs=list(get_percent = TRUE, get_mean = TRUE, get_median = TRUE, get_annotation_overlap = TRUE, parallel = FALSE), SIMPLIFY = FALSE) + metric_dfs_by_net <- mapply(get_metrics, folders, MoreArgs = list(get_percent = TRUE, get_mean = TRUE, get_median = TRUE, get_annotation_overlap = TRUE, parallel = FALSE), SIMPLIFY = FALSE) expect_equal(length(metric_dfs_by_net[[1]]), 6) diff --git a/vignettes/GOeval.Rmd b/vignettes/GOeval.Rmd index d097ceb..ccdcd6a 100644 --- a/vignettes/GOeval.Rmd +++ b/vignettes/GOeval.Rmd @@ -17,9 +17,9 @@ knitr::opts_chunk$set( First, install and load this package. It is easiest to use either the "devtools" or "remotes" package to do so. "remotes" is a smaller package, so it might be the better option if you do not want to install all of "devtools." ```{r, eval = FALSE} -devtools::install_github("westbrooktm/GOeval", ref = "dev") +devtools::install_github("westbrooktm/GOeval") # OR -remotes::install_github("westbrooktm/GOeval", ref = "dev") +remotes::install_github("westbrooktm/GOeval") ``` ```{r setup} @@ -31,7 +31,7 @@ Now that the package is loaded, we need a network as a tab-separated .tsv file w To run the following examples, access the example data stored in "vignette_data.tar.gz" and extract it into a local folder. ```{r, eval = FALSE} -data_path <- system.file(package = "GOeval", file.path("data", "vignette_data.tar.gz")) +data_path <- system.file(package = "GOeval", file.path("vignette_data.tar.gz")) # wherever you want to store the example data data_dir <- "GOeval_data" @@ -67,15 +67,16 @@ Run the `evaluate` function. ```{r, eval = FALSE} net_1_data <- evaluate(network_path, reference_path, out_dir, network_name, - # some additional arguments - edges = c(512, 1024, 2048, 3174, 4096, 8192, 16384, 32768, 65536), - permutations = 1, - get_annotation_overlap = TRUE) + # some additional arguments + edges = c(512, 1024, 2048, 3174, 4096, 8192, 16384, 32768, 65536), + permutations = 1, + get_annotation_overlap = TRUE +) ``` It is possible to customize the behavior of the `evaluate` function by using arguments in addition to the four required ones. In the above example, the 'edges' argument modifies the sizes of subsets to analyze from the input network. "permutations = 1" allows a faster run time than the default "permutations = 3" at the cost of running the analyses on fewer permuted versions of the input network. "get_annotation_overlap = TRUE" indicates that the "annotation_overlap" metric should be calculated. -The available arguments for all functions can be found at the "Reference" tab at the top of this page or https://westbrooktm.github.io/GOeval/reference/index.html. +The available arguments for all functions can be found at the [Reference](https://westbrooktm.github.io/GOeval/reference/index.html) tab at the top of this page. After `evaluate` runs for a couple minutes, the directory at out_dir should contain a folder of subsets of the original network, a folder that contains the ORA summaries and sets of target genes, and a .pdf file with plots of metrics calculated from those summaries. The evaluate function returns the data used to make the plots as a list of data.frames. @@ -85,13 +86,14 @@ Now, let us compare with a second network. The "small_dto.tsv" network does not ```{r, eval = FALSE} net_2_data <- evaluate(file.path(data_dir, "small_dto.tsv"), - file.path(data_dir, "h1_k562_overlap_universe.txt"), - out_dir, - "dto", - permutations = 1, get_annotation_overlap = TRUE, - # without multiple subsets, the plot would just be one dot - # so we can tell it not to bother - plot = FALSE) + file.path(data_dir, "h1_k562_overlap_universe.txt"), + out_dir, + "dto", + permutations = 1, get_annotation_overlap = TRUE, + # without multiple subsets, the plot would just be one dot + # so we can tell it not to bother + plot = FALSE +) ``` Use the output from both networks in the call to `plot_metrics`. @@ -137,17 +139,16 @@ Advanced usage: All intermediary steps in the evaluation can be run separately with the following functions. -1. `subset_network` to create network subsets with the top-weighted edges +1. `subset_network` to create network subsets with the top-weighted edges -2. `webgestalt_network` on each of the network subsets to generate ORA results +2. `webgestalt_network` on each of the network subsets to generate ORA results -3. `get_metrics` on the stored ORA results to calculate summary statistics for plotting +3. `get_metrics` on the stored ORA results to calculate summary statistics for plotting -4. `plot_metrics` using the output of get_metrics -\ -\ -\ -We will now walk through step-by-step evaluation of the network "np3_dtoTFs.tsv", which has 56 TFs and a total of 629,832 edges. +4. `plot_metrics` using the output of get_metrics\ + \ + \ + We will now walk through step-by-step evaluation of the network "np3_dtoTFs.tsv", which has 56 TFs and a total of 629,832 edges. 1\. `subset_network` @@ -156,30 +157,27 @@ We will now walk through step-by-step evaluation of the network "np3_dtoTFs.tsv" This function sorts the input network in descending order by the values in the third column and creates files in the output_folder with only the first two columns of the rows that have the highest scores. ```{r} - input_file <- file.path(data_dir, "np3_dtoTFs.tsv") - - output_directory <- file.path(out_dir, "np3_dtoTFs_subsets") - - name <- "np3" - - # Since we are setting num_possible_TFs, this will be the average number of - # edges from each TF in each subset. - edges <- c(8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096) - - # The "np3_dtoTFs.tsv" network was made to include only 56 TFs. - num_possible_TFs <- 56 - - subset_network(input_file, output_directory, name, edges, num_possible_TFs) +input_file <- file.path(data_dir, "np3_dtoTFs.tsv") + +output_directory <- file.path(out_dir, "np3_dtoTFs_subsets") + +name <- "np3" + +# Since we are setting num_possible_TFs, this will be the average number of +# edges from each TF in each subset. +edges <- c(8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096) + +# The "np3_dtoTFs.tsv" network was made to include only 56 TFs. +num_possible_TFs <- 56 + +subset_network(input_file, output_directory, name, edges, num_possible_TFs) ``` The result is that now the "data/np3_dtoTFs_subsets" folder contains 10 files: "np3_8.tsv" through "np3_4096.tsv". -2. `webgestalt_network` +2. `webgestalt_network` -Run ORA using the `webgestalt_network` function on each of the network subsets created in step 1. -These files are tab-separated with the first column containing the TF names and the second column containing the regulated gene names. -"h1_k562_overlap_universe.txt" contains one column of the names of all genes that could appear in the network. -The network_name should be different for each subset, so I choose to use the subset file names without the extension. +Run ORA using the `webgestalt_network` function on each of the network subsets created in step 1. These files are tab-separated with the first column containing the TF names and the second column containing the regulated gene names. "h1_k562_overlap_universe.txt" contains one column of the names of all genes that could appear in the network. The network_name should be different for each subset, so I choose to use the subset file names without the extension. Note organism = "hsapiens" and database = "geneontology_Biological_Process_noRedundant" is the default database for ORA. @@ -189,49 +187,53 @@ Also note that the run time of the below code could take well over an hour, so t ```{r, eval = FALSE} for (subset in list.files(file.path(data_dir, "np3_dtoTFs_subsets"), full.names = TRUE)) { - webgestalt_network(network_path = subset, - reference_set = file.path(data_dir, "h1_k562_overlap_universe.txt"), - output_directory = file.path(out_dir, "np3_dtoTFs_summaries"), - # this just gets the name of each file minus the extension - network_name = strsplit(basename(subset), "[.]")[[1]][1], - permutations = 3) + webgestalt_network( + network_path = subset, + reference_set = file.path(data_dir, "h1_k562_overlap_universe.txt"), + output_directory = file.path(out_dir, "np3_dtoTFs_summaries"), + # this just gets the name of each file minus the extension + network_name = strsplit(basename(subset), "[.]")[[1]][1], + permutations = 3 + ) } ``` To test the very basics on a personal computer, the following code should run in just a few minutes. ```{r, eval = FALSE} -webgestalt_network(network_path = file.path(data_dir, "np3_dtoTFs_subsets", "np3_8.tsv"), - reference_set = file.path(data_dir, "h1_k562_overlap_universe.txt"), - output_directory = file.path(out_dir, "np3_dtoTFs_single_permutation"), - network_name = "np3_8", - permutations = 1) +webgestalt_network( + network_path = file.path(data_dir, "np3_dtoTFs_subsets", "np3_8.tsv"), + reference_set = file.path(data_dir, "h1_k562_overlap_universe.txt"), + output_directory = file.path(out_dir, "np3_dtoTFs_single_permutation"), + network_name = "np3_8", + permutations = 1 +) ``` -3. `get_metrics` +3. `get_metrics` Run `get_metrics` to calculate the desired metrics based on the output from `webgestalt_network`. Note the arguments 'get_sum' and 'get_size' are TRUE by default. ```{r} - # This would normally be in the 'out_dir' directory, but since this data was - # premade due to computation time constraints, it is available in 'data_dir' - output_folders <- file.path(data_dir, "np3_dtoTFs_summaries") +# This would normally be in the 'out_dir' directory, but since this data was +# premade due to computation time constraints, it is available in 'data_dir' +output_folders <- file.path(data_dir, "np3_dtoTFs_summaries") - metric_dfs <- get_metrics(output_folders, get_percent = TRUE, get_mean = TRUE, get_median = TRUE, get_annotation_overlap = TRUE, parallel = FALSE) +metric_dfs <- get_metrics(output_folders, get_percent = TRUE, get_mean = TRUE, get_median = TRUE, get_annotation_overlap = TRUE, parallel = FALSE) ``` If you want to compare multiple networks, you can run `get_metrics` on both at the same time as shown below with the addition of the "EDN" network. When comparing multiple networks, the sizes for all must be based either on total edges or on edges per TF, but the exact subset sizes do not need to be the same. ```{r, eval = FALSE} - # Specify which folders contain the data for each network. - # Each element will be a folder used as the output folder for a call to webgestalt_network. - # Again, these are normally in 'out_dir' but here they are in 'data_dir' for convenience - output_folders <- c(file.path(data_dir, "np3_dtoTFs_summaries"), file.path(data_dir, "EDN_summaries")) +# Specify which folders contain the data for each network. +# Each element will be a folder used as the output folder for a call to webgestalt_network. +# Again, these are normally in 'out_dir' but here they are in 'data_dir' for convenience +output_folders <- c(file.path(data_dir, "np3_dtoTFs_summaries"), file.path(data_dir, "EDN_summaries")) - metric_dfs_by_net <- mapply(get_metrics, output_folders, MoreArgs=list(get_percent = TRUE, get_mean = TRUE, get_median = TRUE, get_annotation_overlap = TRUE, parallel = FALSE), SIMPLIFY = FALSE) +metric_dfs_by_net <- mapply(get_metrics, output_folders, MoreArgs = list(get_percent = TRUE, get_mean = TRUE, get_median = TRUE, get_annotation_overlap = TRUE, parallel = FALSE), SIMPLIFY = FALSE) ``` -4. `plot_metrics` +4. `plot_metrics` Set a title and subtitle for the plots.\ Specify whether the subset sizes are based on total edges or edges per TF.\ @@ -240,11 +242,11 @@ Set the metric booleans to the same values as those used in `get_metrics` to ens `plot_metrics` also returns a list of the data.frames used to plot. Each data.frame contains values for one metric across all networks, subsets, and permutations. ```{r} - # Specify the main titles of the output graphs - title_text <- "NP3" - subtitle_text <- "subset for TFs accepted by DTO" - - perTF <- TRUE - - plot_data <- plot_metrics(metric_dfs, title_text, subtitle_text, perTF, percent = TRUE, mean = TRUE, median = TRUE, annotation_overlap = TRUE) +# Specify the main titles of the output graphs +title_text <- "NP3" +subtitle_text <- "subset for TFs accepted by DTO" + +perTF <- TRUE + +plot_data <- plot_metrics(metric_dfs, title_text, subtitle_text, perTF, percent = TRUE, mean = TRUE, median = TRUE, annotation_overlap = TRUE) ```