Skip to content

Commit

Permalink
vignette updates and fix to plot_metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
westbrooktm committed Jul 7, 2023
1 parent 9693c2d commit e15b7df
Show file tree
Hide file tree
Showing 10 changed files with 124 additions and 59 deletions.
18 changes: 14 additions & 4 deletions R/evaluate.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
#' @param output_directory path to the directory in which to store the generated network subsets,
#' ORA summaries, and plots
#' @param network_name short name for the network - used in file naming so may not contain spaces
#' @param organism a string specifying the organism that the data is from, e.g.
#' "hsapiens" or "scerevisiae" - see options with WebGestaltR::listOrganism()
#' @param database the gene set database to search for enrichment - see options with WebGestaltR::listGeneSet().
#' Must be a Gene Ontology "biological process" database if get_annotation_overlap = TRUE.
#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType()
#' and see webgestalt.org for examples of each type
#' @param edges list of total numbers of edges or average edges per TF to include in each subset
#' @param num_possible_TFs if set to a number > 0, the elements of 'edges' will first be
#' multiplied by this number to get the number of edges for each subset
Expand All @@ -39,13 +45,14 @@
#'
#' @details
#' The input file should be tab-separated with two or three columns: source node (e.g. transcription factor),
#' target node (e.g. the regulated gene), and, optionally, edge score. All genes must be written as Ensembl gene IDs.
#' target node (e.g. the regulated gene), and, optionally, edge score.
#'
#' @return output of `get_metrics`. Can be used as input to `plot_metrics`.
#'
#' @export
evaluate <- function(network, reference_set, output_directory, network_name, edges = c(512, 1024, 2048, 4096, 8192, 16384, 32768, 65536), num_possible_TFs = 0,
permutations = 3, penalty = 3, fdr_threshold = 0.05, get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, plot = TRUE) {
evaluate <- function(network, reference_set, output_directory, network_name, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", gene_id = "ensembl_gene_id",
edges = c(512, 1024, 2048, 4096, 8192, 16384, 32768, 65536), num_possible_TFs = 0, permutations = 3, penalty = 3, fdr_threshold = 0.05,
get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, plot = TRUE) {

# first package function
subset_network(network, file.path(output_directory, paste0(network_name, "_subsets")), network_name, edges, num_possible_TFs)
Expand All @@ -62,12 +69,15 @@ evaluate <- function(network, reference_set, output_directory, network_name, edg
output_directory = file.path(output_directory, paste0(network_name, summaries_suffix)),
# this just gets the name of each file minus the extension
network_name = strsplit(basename(subset), "[.]")[[1]][1],
organism = organism,
database = database,
gene_id = gene_id,
permutations = permutations)
}

# third package function
# mapply returns a list of length 1 containing the output of get_metrics
metric_dfs_by_net <- mapply(get_metrics, file.path(output_directory, paste0(network_name, summaries_suffix)), MoreArgs=list(get_sum = get_sum, get_percent = get_percent, get_mean = get_mean, get_median = get_median, get_annotation_overlap = get_annotation_overlap, get_size = get_size, penalty = penalty, fdr_threshold = fdr_threshold, parallel = FALSE), SIMPLIFY = FALSE)
metric_dfs_by_net <- mapply(get_metrics, file.path(output_directory, paste0(network_name, summaries_suffix)), MoreArgs=list(organism = organism, database = database, gene_id = gene_id, get_sum = get_sum, get_percent = get_percent, get_mean = get_mean, get_median = get_median, get_annotation_overlap = get_annotation_overlap, get_size = get_size, penalty = penalty, fdr_threshold = fdr_threshold, parallel = FALSE), SIMPLIFY = FALSE)

if (plot) {
pdf(file.path(output_directory, paste0(network_name, "_ORA_metrics_plots_", formatted_time, ".pdf")), 7, 5)
Expand Down
29 changes: 18 additions & 11 deletions R/get_metrics.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#' summaries output by `webgestalt_network`
#' @param organism a string specifying the organism that the data is from, e.g.
#' "hsapiens" or "scerevisiae"
#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType()
#' and see webgestalt.org for examples of each type
#' @param go_ann a data.frame of annotations of genes to GO terms. Obtain with
#' WebGestaltR::loadGeneSet.
#' @param go_reg a data.frame of the regulatory relationships between GO terms.
Expand All @@ -18,11 +20,10 @@
#' @return a list of source nodes
#'
#' @keywords internal
get_annotation_overlap <- function(terms, organism, go_ann, go_reg) {
get_annotation_overlap <- function(terms, organism, gene_id, go_ann, go_reg) {
tf_ids <- unique(terms$tfId)

# assumption of "ensembl_gene_id"
tf_map <- WebGestaltR::idMapping(organism = organism, inputGene = tf_ids, sourceIdType = "ensembl_gene_id", targetIdType = "entrezgene", host = "https://www.webgestalt.org/")
tf_map <- WebGestaltR::idMapping(organism = organism, inputGene = tf_ids, sourceIdType = gene_id, targetIdType = "entrezgene", host = "https://www.webgestalt.org/")

overlap_list <- list()
for (tf in tf_ids) {
Expand Down Expand Up @@ -63,7 +64,9 @@ get_annotation_overlap <- function(terms, organism, go_ann, go_reg) {
#' Use `get_terms` to obtain.
#' @param network_size the number of source nodes in the network
#' @param organism a string specifying the organism that the data is from, e.g.
#' "hsapiens" or "scerevisiae". Only required if get_annotation_overlap = TRUE.
#' "hsapiens" or "scerevisiae"
#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType()
#' and see webgestalt.org for examples of each type
#' @param get_sum boolean whether to get the 'sum' metric, which is the sum of the negative
#' log base 10 of the p-value for the top term of each source node minus 'penalty' times the total
#' number of source nodes.
Expand All @@ -90,7 +93,7 @@ get_annotation_overlap <- function(terms, organism, go_ann, go_reg) {
#' @return a list of metric values
#'
#' @keywords internal
get_network_metrics <- function(full_terms, network_size, organism, get_sum, get_percent, get_mean, get_median, get_annotation_overlap, get_size, penalty, fdr_threshold, go_ann = NULL, go_reg = NULL) {
get_network_metrics <- function(full_terms, network_size, organism, gene_id, get_sum, get_percent, get_mean, get_median, get_annotation_overlap, get_size, penalty, fdr_threshold, go_ann = NULL, go_reg = NULL) {
if (!any(is.na(full_terms))) {
terms <- full_terms[match(unique(full_terms$tfId), full_terms$tfId), ]
percent <- 100 * sum(terms$FDR < fdr_threshold) / network_size
Expand All @@ -99,7 +102,7 @@ get_network_metrics <- function(full_terms, network_size, organism, get_sum, get
neglogp <- ifelse(is.finite(neglogp), neglogp, 16)
#neglogp_zeros <- append(rep(0, network_size - length(terms$geneSet)), neglogp)
if (get_annotation_overlap) {
prior_ann <- 100 * length(get_annotation_overlap(full_terms, organism, go_ann = go_ann, go_reg = go_reg)) / network_size
prior_ann <- 100 * length(get_annotation_overlap(full_terms, organism, gene_id, go_ann = go_ann, go_reg = go_reg)) / network_size
} else {
prior_ann <- 0
}
Expand Down Expand Up @@ -132,7 +135,11 @@ get_network_metrics <- function(full_terms, network_size, organism, get_sum, get
#' @param directory a directory containing only the directories of ORA summaries created by
#' `webgestalt_network` for all networks of interest
#' @param organism a string specifying the organism that the data is from, e.g.
#' "hsapiens" or "scerevisiae". Only required if get_annotation_overlap = TRUE.
#' "hsapiens" or "scerevisiae". Only required if get_annotation_overlap = TRUE.
#' @param database the gene set database to search for enrichment - see options with WebGestaltR::listGeneSet().
#' Must be a Gene Ontology "biological process" database if get_annotation_overlap = TRUE.
#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType()
#' and see webgestalt.org for examples of each type. Only required if get_annotation_overlap = TRUE.
#' @param get_sum boolean whether to get the 'sum' metric, which is the sum of the negative
#' log base 10 of the p-value for the top term of each source node minus 'penalty' times the total
#' number of source nodes.
Expand All @@ -159,7 +166,7 @@ get_network_metrics <- function(full_terms, network_size, organism, get_sum, get
#' represent the different network permutations. The first row is from the unpermuted networks.
#'
#' @export
get_metrics <- function(directory, organism = "hsapiens", get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, penalty = 3, fdr_threshold = 0.05, parallel = FALSE) {
get_metrics <- function(directory, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", gene_id = "ensembl_gene_id", get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, penalty = 3, fdr_threshold = 0.05, parallel = FALSE) {
# GO regulatory relationships only needed if get_annotation_overlap = TRUE
if (get_annotation_overlap) {
# Load regulatory relationships between GO terms for the calculation of overlap between TF GO
Expand All @@ -170,7 +177,7 @@ get_metrics <- function(directory, organism = "hsapiens", get_sum = TRUE, get_pe
rm(go)

# Load gene annotations with WebGestaltR::loadGeneSet.
suppressWarnings(go_ann <- WebGestaltR::loadGeneSet(organism = "hsapiens", enrichDatabase = "geneontology_Biological_Process_noRedundant", hostName = "https://www.webgestalt.org/")$geneSet)
suppressWarnings(go_ann <- WebGestaltR::loadGeneSet(organism = organism, enrichDatabase = database, hostName = "https://www.webgestalt.org/")$geneSet)
} else {
go_reg <- NULL
go_ann <- NULL
Expand Down Expand Up @@ -199,7 +206,7 @@ get_metrics <- function(directory, organism = "hsapiens", get_sum = TRUE, get_pe
})
paths <- list.dirs(net, full.names = TRUE, recursive = FALSE)
p_terms <- mapply(get_terms, paths, 16, SIMPLIFY = FALSE)
return(t(mapply(get_network_metrics, p_terms, network_sizes, organism, get_sum, get_percent, get_mean, get_median, get_annotation_overlap, get_size, penalty, fdr_threshold, MoreArgs = list(go_ann = go_ann, go_reg = go_reg))))
return(t(mapply(get_network_metrics, p_terms, network_sizes, organism, gene_id, get_sum, get_percent, get_mean, get_median, get_annotation_overlap, get_size, penalty, fdr_threshold, MoreArgs = list(go_ann = go_ann, go_reg = go_reg))))
}, mc.cores = parallelly::availableCores())
} else {
# non-parallel version
Expand All @@ -211,7 +218,7 @@ get_metrics <- function(directory, organism = "hsapiens", get_sum = TRUE, get_pe
})
paths <- list.dirs(net, full.names = TRUE, recursive = FALSE)
p_terms <- mapply(get_terms, paths, 16, SIMPLIFY = FALSE)
return(t(mapply(get_network_metrics, p_terms, network_sizes, organism, get_sum, get_percent, get_mean, get_median, get_annotation_overlap, get_size, penalty, fdr_threshold, MoreArgs = list(go_ann = go_ann, go_reg = go_reg))))
return(t(mapply(get_network_metrics, p_terms, network_sizes, organism, gene_id, get_sum, get_percent, get_mean, get_median, get_annotation_overlap, get_size, penalty, fdr_threshold, MoreArgs = list(go_ann = go_ann, go_reg = go_reg))))
})
}
df_list <- list()
Expand Down
6 changes: 3 additions & 3 deletions R/plot_metrics.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ plot_metrics <- function(metric_dfs_by_net, title_text, subtitle_text = "", perT
# if any of the networks whose metrics are being plotted has only one size,
# plotting with box plots for the permuted networks looks better
if (plot_with_boxes) {
plt <- plt + ggplot2::geom_boxplot(permuted, mapping=ggplot2::aes(x=size, y=metric, color = method, group = size)) +
ggplot2::labs(x = ifelse(perTF, "Edges per TF", "Edges"), y = label_text[m], color = "Network name", shape = "Network name")
plt <- plt + ggplot2::geom_boxplot(permuted, mapping=ggplot2::aes(x=size, y=metric, color = method, group = interaction(method, size))) +
ggplot2::labs(x = ifelse(perTF, "Edges per TF", "Edges"), y = label_text[m], color = "Network", shape = "Network")
} else {
medians = dplyr::summarise(group_by(permuted, network), median = median(metric))
medians <- tidyr::separate_wider_regex(medians, network, c(method = ".*", "_", size = ".*"), cols_remove = FALSE)
Expand All @@ -126,7 +126,7 @@ plot_metrics <- function(metric_dfs_by_net, title_text, subtitle_text = "", perT
for(net in unique(permuted$method)) {
plt <- plt + ggplot2::geom_line(medians[medians$method == net,], mapping = ggplot2::aes(x = size, y = median, color = method, linetype = ltype), linewidth = 1.5)
}
plt <- plt + ggplot2::labs(x = ifelse(perTF, "Edges per TF", "Edges"), y = label_text[m], color = "Network name", shape = "Network name", linetype = "Data")
plt <- plt + ggplot2::labs(x = ifelse(perTF, "Edges per TF", "Edges"), y = label_text[m], color = "Network", shape = "Network", linetype = "Data")
}
plot(plt)
}
Expand Down
17 changes: 9 additions & 8 deletions R/webgestalt_network.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,24 @@
#' directory for the output from all network subsets of a single original network.
#' @param network_name the name of the directory to store the results within the output_directory.
#' It should be in the same format as the files output by `subset_network`: "\{name\}_\{# of edges\}" i.e. "example_8".
#' @param organism human: "hsapiens"; yeast: "scerevisiae"
#' @param organism a string specifying the organism that the data is from, e.g.
#' "hsapiens" or "scerevisiae" - see options with WebGestaltR::listOrganism()
#' @param database the gene set database to search for enrichment - see options with WebGestaltR::listGeneSet()
#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType()
#' and see webgestalt.org for examples of each type
#' @param permutations the number of randomly permuted networks to create and run ORA on
#'
#' @details
#' The input network file should be a tab-separated .tsv where the first column is the
#' source nodes (TFs) and the second column is the target nodes (regulated genes).
#' All genes must be written as Ensembl gene IDs.
#'
#' @return NULL
#'
#' @export
webgestalt_network <- function(network_path, reference_set, output_directory, network_name, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", permutations = 10) {
webgestalt_network <- function(network_path, reference_set, output_directory, network_name, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", gene_id = "ensembl_gene_id", permutations = 10) {
METHOD="ORA" # ORA | GSEA | NTA
GENE_ID="ensembl_gene_id" # see options with listIdType() - set to ensembl_gene_id for now
# reports are more in-depth than summaries - advisable to keep reports FALSE if not needed
REPORTS_PATH=output_directory#"GO_results" # only used if GENERATE_REPORT=TRUE
REPORTS_PATH=output_directory # only used if GENERATE_REPORT=TRUE
GENERATE_REPORT=FALSE

# path must exist even if GENERATE_REPORT=FALSE
Expand All @@ -60,7 +61,7 @@ webgestalt_network <- function(network_path, reference_set, output_directory, ne
}

# finds the subset of genes in the reference set that are annotated to valid GO terms
ref_genes = WebGestaltR::idMapping(organism = organism, inputGene = read.table(reference_set)$V1, sourceIdType = GENE_ID, targetIdType = "entrezgene", host = "https://www.webgestalt.org/")
ref_genes = WebGestaltR::idMapping(organism = organism, inputGene = read.table(reference_set)$V1, sourceIdType = gene_id, targetIdType = "entrezgene", host = "https://www.webgestalt.org/")
annotations = suppressWarnings(WebGestaltR::loadGeneSet(organism = organism, enrichDatabase = database))
genes_per_term = table(annotations$geneSet$geneSet)
annotations_proper_size = annotations$geneSet[(genes_per_term[annotations$geneSet$geneSet] >= 10 & genes_per_term[annotations$geneSet$geneSet] <= 500),]
Expand Down Expand Up @@ -108,9 +109,9 @@ webgestalt_network <- function(network_path, reference_set, output_directory, ne
organism = organism,
enrichDatabase = database,
interestGeneFolder = file.path(work_dir,paste0("p",i-1)),
interestGeneType = GENE_ID,
interestGeneType = gene_id,
referenceGene = annotated_ref_genes$userId,
referenceGeneType = GENE_ID,
referenceGeneType = gene_id,
minNum = 10, # default 10
maxNum = 500, # default 500
reportNum = 20, # default 20
Expand Down
14 changes: 13 additions & 1 deletion man/evaluate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/get_annotation_overlap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions man/get_metrics.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit e15b7df

Please sign in to comment.