vignette updates and fix to plot_metrics

BrentLab · Jul 7, 2023 · e15b7df · e15b7df
1 parent 9693c2d
commit e15b7df
Show file tree

Hide file tree

Showing 10 changed files with 124 additions and 59 deletions.
diff --git a/R/evaluate.R b/R/evaluate.R
@@ -13,6 +13,12 @@
 #' @param output_directory path to the directory in which to store the generated network subsets,
 #'  ORA summaries, and plots
 #' @param network_name short name for the network - used in file naming so may not contain spaces
+#' @param organism a string specifying the organism that the data is from, e.g.
+#'    "hsapiens" or "scerevisiae" - see options with WebGestaltR::listOrganism()
+#' @param database the gene set database to search for enrichment - see options with WebGestaltR::listGeneSet().
+#'  Must be a Gene Ontology "biological process" database if get_annotation_overlap = TRUE.
+#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType()
+#'  and see webgestalt.org for examples of each type
 #' @param edges list of total numbers of edges or average edges per TF to include in each subset
 #' @param num_possible_TFs if set to a number > 0, the elements of 'edges' will first be
 #'  multiplied by this number to get the number of edges for each subset
@@ -39,13 +45,14 @@
 #'
 #' @details
 #' The input file should be tab-separated with two or three columns: source node (e.g. transcription factor),
-#'  target node (e.g. the regulated gene), and, optionally, edge score. All genes must be written as Ensembl gene IDs.
+#'  target node (e.g. the regulated gene), and, optionally, edge score.
 #'
 #' @return output of `get_metrics`. Can be used as input to `plot_metrics`.
 #'
 #' @export
-evaluate <- function(network, reference_set, output_directory, network_name, edges = c(512, 1024, 2048, 4096, 8192, 16384, 32768, 65536), num_possible_TFs = 0,
-                     permutations = 3, penalty = 3, fdr_threshold = 0.05, get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, plot = TRUE) {
+evaluate <- function(network, reference_set, output_directory, network_name, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", gene_id = "ensembl_gene_id",
+                     edges = c(512, 1024, 2048, 4096, 8192, 16384, 32768, 65536), num_possible_TFs = 0, permutations = 3, penalty = 3, fdr_threshold = 0.05,
+                     get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, plot = TRUE) {
 
   # first package function
   subset_network(network, file.path(output_directory, paste0(network_name, "_subsets")), network_name, edges, num_possible_TFs)
@@ -62,12 +69,15 @@ evaluate <- function(network, reference_set, output_directory, network_name, edg
                        output_directory = file.path(output_directory, paste0(network_name, summaries_suffix)),
                        # this just gets the name of each file minus the extension
                        network_name = strsplit(basename(subset), "[.]")[[1]][1],
+                       organism = organism,
+                       database = database,
+                       gene_id = gene_id,
                        permutations = permutations)
   }
 
   # third package function
   # mapply returns a list of length 1 containing the output of get_metrics
-  metric_dfs_by_net <- mapply(get_metrics, file.path(output_directory, paste0(network_name, summaries_suffix)), MoreArgs=list(get_sum = get_sum, get_percent = get_percent, get_mean = get_mean, get_median = get_median, get_annotation_overlap = get_annotation_overlap, get_size = get_size, penalty = penalty, fdr_threshold = fdr_threshold, parallel = FALSE), SIMPLIFY = FALSE)
+  metric_dfs_by_net <- mapply(get_metrics, file.path(output_directory, paste0(network_name, summaries_suffix)), MoreArgs=list(organism = organism, database = database, gene_id = gene_id, get_sum = get_sum, get_percent = get_percent, get_mean = get_mean, get_median = get_median, get_annotation_overlap = get_annotation_overlap, get_size = get_size, penalty = penalty, fdr_threshold = fdr_threshold, parallel = FALSE), SIMPLIFY = FALSE)
 
   if (plot) {
     pdf(file.path(output_directory, paste0(network_name, "_ORA_metrics_plots_", formatted_time, ".pdf")), 7, 5)

diff --git a/R/get_metrics.R b/R/get_metrics.R
@@ -10,6 +10,8 @@
 #'    summaries output by `webgestalt_network`
 #' @param organism a string specifying the organism that the data is from, e.g.
 #'    "hsapiens" or "scerevisiae"
+#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType()
+#'  and see webgestalt.org for examples of each type
 #' @param go_ann a data.frame of annotations of genes to GO terms. Obtain with
 #'  WebGestaltR::loadGeneSet.
 #' @param go_reg a data.frame of the regulatory relationships between GO terms.
@@ -18,11 +20,10 @@
 #' @return a list of source nodes
 #'
 #' @keywords internal
-get_annotation_overlap <- function(terms, organism, go_ann, go_reg) {
+get_annotation_overlap <- function(terms, organism, gene_id, go_ann, go_reg) {
   tf_ids <- unique(terms$tfId)
 
-  # assumption of "ensembl_gene_id"
-  tf_map <- WebGestaltR::idMapping(organism = organism, inputGene = tf_ids, sourceIdType = "ensembl_gene_id", targetIdType = "entrezgene", host = "https://www.webgestalt.org/")
+  tf_map <- WebGestaltR::idMapping(organism = organism, inputGene = tf_ids, sourceIdType = gene_id, targetIdType = "entrezgene", host = "https://www.webgestalt.org/")
 
   overlap_list <- list()
   for (tf in tf_ids) {
@@ -63,7 +64,9 @@ get_annotation_overlap <- function(terms, organism, go_ann, go_reg) {
 #'  Use `get_terms` to obtain.
 #' @param network_size the number of source nodes in the network
 #' @param organism a string specifying the organism that the data is from, e.g.
-#'  "hsapiens" or "scerevisiae". Only required if get_annotation_overlap = TRUE.
+#'  "hsapiens" or "scerevisiae"
+#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType()
+#'  and see webgestalt.org for examples of each type
 #' @param get_sum boolean whether to get the 'sum' metric, which is the sum of the negative
 #'  log base 10 of the p-value for the top term of each source node minus 'penalty' times the total
 #'  number of source nodes.
@@ -90,7 +93,7 @@ get_annotation_overlap <- function(terms, organism, go_ann, go_reg) {
 #' @return a list of metric values
 #'
 #' @keywords internal
-get_network_metrics <- function(full_terms, network_size, organism, get_sum, get_percent, get_mean, get_median, get_annotation_overlap, get_size, penalty, fdr_threshold, go_ann = NULL, go_reg = NULL) {
+get_network_metrics <- function(full_terms, network_size, organism, gene_id, get_sum, get_percent, get_mean, get_median, get_annotation_overlap, get_size, penalty, fdr_threshold, go_ann = NULL, go_reg = NULL) {
   if (!any(is.na(full_terms))) {
     terms <- full_terms[match(unique(full_terms$tfId), full_terms$tfId), ]
     percent <- 100 * sum(terms$FDR < fdr_threshold) / network_size
@@ -99,7 +102,7 @@ get_network_metrics <- function(full_terms, network_size, organism, get_sum, get
     neglogp <- ifelse(is.finite(neglogp), neglogp, 16)
     #neglogp_zeros <- append(rep(0, network_size - length(terms$geneSet)), neglogp)
     if (get_annotation_overlap) {
-      prior_ann <- 100 * length(get_annotation_overlap(full_terms, organism, go_ann = go_ann, go_reg = go_reg)) / network_size
+      prior_ann <- 100 * length(get_annotation_overlap(full_terms, organism, gene_id, go_ann = go_ann, go_reg = go_reg)) / network_size
     } else {
       prior_ann <- 0
     }
@@ -132,7 +135,11 @@ get_network_metrics <- function(full_terms, network_size, organism, get_sum, get
 #' @param directory a directory containing only the directories of ORA summaries created by
 #'  `webgestalt_network` for all networks of interest
 #' @param organism a string specifying the organism that the data is from, e.g.
-#'    "hsapiens" or "scerevisiae". Only required if get_annotation_overlap = TRUE.
+#'  "hsapiens" or "scerevisiae". Only required if get_annotation_overlap = TRUE.
+#' @param database the gene set database to search for enrichment - see options with WebGestaltR::listGeneSet().
+#'  Must be a Gene Ontology "biological process" database if get_annotation_overlap = TRUE.
+#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType()
+#'  and see webgestalt.org for examples of each type. Only required if get_annotation_overlap = TRUE.
 #' @param get_sum boolean whether to get the 'sum' metric, which is the sum of the negative
 #'  log base 10 of the p-value for the top term of each source node minus 'penalty' times the total
 #'  number of source nodes.
@@ -159,7 +166,7 @@ get_network_metrics <- function(full_terms, network_size, organism, get_sum, get
 #'  represent the different network permutations. The first row is from the unpermuted networks.
 #'
 #' @export
-get_metrics <- function(directory, organism = "hsapiens", get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, penalty = 3, fdr_threshold = 0.05, parallel = FALSE) {
+get_metrics <- function(directory, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", gene_id = "ensembl_gene_id", get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, penalty = 3, fdr_threshold = 0.05, parallel = FALSE) {
   # GO regulatory relationships only needed if get_annotation_overlap = TRUE
   if (get_annotation_overlap) {
     # Load regulatory relationships between GO terms for the calculation of overlap between TF GO
@@ -170,7 +177,7 @@ get_metrics <- function(directory, organism = "hsapiens", get_sum = TRUE, get_pe
     rm(go)
 
     # Load gene annotations with WebGestaltR::loadGeneSet.
-    suppressWarnings(go_ann <- WebGestaltR::loadGeneSet(organism = "hsapiens", enrichDatabase = "geneontology_Biological_Process_noRedundant", hostName = "https://www.webgestalt.org/")$geneSet)
+    suppressWarnings(go_ann <- WebGestaltR::loadGeneSet(organism = organism, enrichDatabase = database, hostName = "https://www.webgestalt.org/")$geneSet)
   } else {
     go_reg <- NULL
     go_ann <- NULL
@@ -199,7 +206,7 @@ get_metrics <- function(directory, organism = "hsapiens", get_sum = TRUE, get_pe
       })
       paths <- list.dirs(net, full.names = TRUE, recursive = FALSE)
       p_terms <- mapply(get_terms, paths, 16, SIMPLIFY = FALSE)
-      return(t(mapply(get_network_metrics, p_terms, network_sizes, organism, get_sum, get_percent, get_mean, get_median, get_annotation_overlap, get_size, penalty, fdr_threshold, MoreArgs = list(go_ann = go_ann, go_reg = go_reg))))
+      return(t(mapply(get_network_metrics, p_terms, network_sizes, organism, gene_id, get_sum, get_percent, get_mean, get_median, get_annotation_overlap, get_size, penalty, fdr_threshold, MoreArgs = list(go_ann = go_ann, go_reg = go_reg))))
     }, mc.cores = parallelly::availableCores())
   } else {
     # non-parallel version
@@ -211,7 +218,7 @@ get_metrics <- function(directory, organism = "hsapiens", get_sum = TRUE, get_pe
       })
       paths <- list.dirs(net, full.names = TRUE, recursive = FALSE)
       p_terms <- mapply(get_terms, paths, 16, SIMPLIFY = FALSE)
-      return(t(mapply(get_network_metrics, p_terms, network_sizes, organism, get_sum, get_percent, get_mean, get_median, get_annotation_overlap, get_size, penalty, fdr_threshold, MoreArgs = list(go_ann = go_ann, go_reg = go_reg))))
+      return(t(mapply(get_network_metrics, p_terms, network_sizes, organism, gene_id, get_sum, get_percent, get_mean, get_median, get_annotation_overlap, get_size, penalty, fdr_threshold, MoreArgs = list(go_ann = go_ann, go_reg = go_reg))))
     })
   }
   df_list <- list()

diff --git a/R/plot_metrics.R b/R/plot_metrics.R
@@ -115,8 +115,8 @@ plot_metrics <- function(metric_dfs_by_net, title_text, subtitle_text = "", perT
     # if any of the networks whose metrics are being plotted has only one size,
     # plotting with box plots for the permuted networks looks better
     if (plot_with_boxes) {
-      plt <- plt + ggplot2::geom_boxplot(permuted, mapping=ggplot2::aes(x=size, y=metric, color = method, group = size)) +
-        ggplot2::labs(x = ifelse(perTF, "Edges per TF", "Edges"), y = label_text[m], color = "Network name", shape = "Network name")
+      plt <- plt + ggplot2::geom_boxplot(permuted, mapping=ggplot2::aes(x=size, y=metric, color = method, group = interaction(method, size))) +
+        ggplot2::labs(x = ifelse(perTF, "Edges per TF", "Edges"), y = label_text[m], color = "Network", shape = "Network")
     } else {
       medians = dplyr::summarise(group_by(permuted, network), median = median(metric))
       medians <- tidyr::separate_wider_regex(medians, network, c(method = ".*", "_", size = ".*"), cols_remove = FALSE)
@@ -126,7 +126,7 @@ plot_metrics <- function(metric_dfs_by_net, title_text, subtitle_text = "", perT
       for(net in unique(permuted$method)) {
         plt <- plt + ggplot2::geom_line(medians[medians$method == net,], mapping = ggplot2::aes(x = size, y = median, color = method, linetype = ltype), linewidth = 1.5)
       }
-      plt <- plt + ggplot2::labs(x = ifelse(perTF, "Edges per TF", "Edges"), y = label_text[m], color = "Network name", shape = "Network name", linetype = "Data")
+      plt <- plt + ggplot2::labs(x = ifelse(perTF, "Edges per TF", "Edges"), y = label_text[m], color = "Network", shape = "Network", linetype = "Data")
     }
     plot(plt)
   }

diff --git a/R/webgestalt_network.R b/R/webgestalt_network.R
@@ -26,23 +26,24 @@
 #'  directory for the output from all network subsets of a single original network.
 #' @param network_name the name of the directory to store the results within the output_directory.
 #'  It should be in the same format as the files output by `subset_network`: "\{name\}_\{# of edges\}" i.e. "example_8".
-#' @param organism human: "hsapiens"; yeast: "scerevisiae"
+#' @param organism a string specifying the organism that the data is from, e.g.
+#'    "hsapiens" or "scerevisiae" - see options with WebGestaltR::listOrganism()
 #' @param database the gene set database to search for enrichment - see options with WebGestaltR::listGeneSet()
+#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType()
+#'  and see webgestalt.org for examples of each type
 #' @param permutations the number of randomly permuted networks to create and run ORA on
 #'
 #' @details
 #' The input network file should be a tab-separated .tsv where the first column is the
 #'  source nodes (TFs) and the second column is the target nodes (regulated genes).
-#' All genes must be written as Ensembl gene IDs.
 #'
 #' @return NULL
 #'
 #' @export
-webgestalt_network <- function(network_path, reference_set, output_directory, network_name, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", permutations = 10) {
+webgestalt_network <- function(network_path, reference_set, output_directory, network_name, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", gene_id = "ensembl_gene_id", permutations = 10) {
   METHOD="ORA" # ORA | GSEA | NTA
-  GENE_ID="ensembl_gene_id" # see options with listIdType() - set to ensembl_gene_id for now
   # reports are more in-depth than summaries - advisable to keep reports FALSE if not needed
-  REPORTS_PATH=output_directory#"GO_results" # only used if GENERATE_REPORT=TRUE
+  REPORTS_PATH=output_directory # only used if GENERATE_REPORT=TRUE
   GENERATE_REPORT=FALSE
 
   # path must exist even if GENERATE_REPORT=FALSE
@@ -60,7 +61,7 @@ webgestalt_network <- function(network_path, reference_set, output_directory, ne
     }
 
     # finds the subset of genes in the reference set that are annotated to valid GO terms
-    ref_genes = WebGestaltR::idMapping(organism = organism, inputGene = read.table(reference_set)$V1, sourceIdType = GENE_ID, targetIdType = "entrezgene", host = "https://www.webgestalt.org/")
+    ref_genes = WebGestaltR::idMapping(organism = organism, inputGene = read.table(reference_set)$V1, sourceIdType = gene_id, targetIdType = "entrezgene", host = "https://www.webgestalt.org/")
     annotations = suppressWarnings(WebGestaltR::loadGeneSet(organism = organism, enrichDatabase = database))
     genes_per_term = table(annotations$geneSet$geneSet)
     annotations_proper_size = annotations$geneSet[(genes_per_term[annotations$geneSet$geneSet] >= 10 & genes_per_term[annotations$geneSet$geneSet] <= 500),]
@@ -108,9 +109,9 @@ webgestalt_network <- function(network_path, reference_set, output_directory, ne
         organism = organism,
         enrichDatabase = database,
         interestGeneFolder = file.path(work_dir,paste0("p",i-1)),
-        interestGeneType = GENE_ID,
+        interestGeneType = gene_id,
         referenceGene = annotated_ref_genes$userId,
-        referenceGeneType = GENE_ID,
+        referenceGeneType = gene_id,
         minNum = 10, # default 10
         maxNum = 500, # default 500
         reportNum = 20, # default 20

diff --git a/man/evaluate.Rd b/man/evaluate.Rd
diff --git a/man/get_annotation_overlap.Rd b/man/get_annotation_overlap.Rd
diff --git a/man/get_metrics.Rd b/man/get_metrics.Rd