-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #14 from westbrooktm/dev
Documentation updates
- Loading branch information
Showing
244 changed files
with
117,724 additions
and
108,744 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,61 +1,91 @@ | ||
#' evaluate | ||
|
||
#' Perform the whole GOeval pipeline on one network | ||
#' | ||
#' Perform the whole GOeval pipeline on one network. Given one network, this | ||
#' function will first make subsets with subset_network, then run ORA with | ||
#' webgestalt_network, followed by get_metrics and plot_metrics to plot summary | ||
#' statistics. | ||
#' The input file should be tab-separated with three columns: source node (e.g. transcription factor), | ||
#' target node (e.g. the regulated gene), and edge score. All genes should be written as Ensembl gene IDs. | ||
#' @description | ||
#' Given one network, `evaluate` will first make subsets with `subset_network`, | ||
#' then run Over-Representation Analysis (ORA) with `webgestalt_network`, | ||
#' followed by `get_metrics` and `plot_metrics` to plot selected summary statistics. | ||
#' | ||
#' @param network a file containing the network to create subsets from. The file should | ||
#' @param network path to the file containing the network to create subsets from. The file should | ||
#' be tab-separated with three columns: source node, target node, edge score | ||
#' @param reference_set path to the set of all genes possibly included in the network. Must be a .txt | ||
#' file containing exactly one column of the genes that could possibly appear in the network. | ||
#' @param output_directory path to the directory in which to store the generated network subsets, | ||
#' ORA summaries, and plots | ||
#' @param network_name short name for the network - used in file naming so may not contain spaces | ||
#' @param organism a string specifying the organism that the data is from, e.g. | ||
#' "hsapiens" or "scerevisiae" - see options with WebGestaltR::listOrganism() | ||
#' @param database the gene set database to search for enrichment - see options with WebGestaltR::listGeneSet(). | ||
#' Must be a Gene Ontology "biological process" database if get_annotation_overlap = TRUE. | ||
#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType() | ||
#' and see webgestalt.org for examples of each type | ||
#' @param edges list of total numbers of edges or average edges per TF to include in each subset | ||
#' @param num_possible_TFs if set to a number > 0, the elements of 'edges' will first be | ||
#' multiplied by this number to get the number of edges for each subset | ||
#' @param permutations the number of randomly permuted networks to create and run ORA | ||
#' @param get_sum bool whether to get the 'sum' metric, which is the sum of the negative | ||
#' log base 10 of the p-value for the top term of each TF minus 3 times the total | ||
#' number of TFs. | ||
#' @param get_percent bool whether to get the 'percent' metric, which is the | ||
#' percent of TFs with at least one GO term with a FDR < 0.05 | ||
#' @param get_mean bool whether to get the 'mean' metric, which is the mean negative | ||
#' log base 10 of the p-value for the top term of each TF | ||
#' @param get_median bool whether to get the 'median' metric, which is the median negative | ||
#' log base 10 of the p-value for the top term of each TF | ||
#' @param get_annotation_overlap bool whether to get the 'annotation_overlap' metric, | ||
#' which is the percent of TFs that are annotated to a GO term for which their | ||
#' target genes are enriched | ||
#' @param get_size bool whether to get the 'size' metric, which is the number of | ||
#' TFs in the network subset that have more than one target gene with GO annotations | ||
#' @param plot bool whether to make a pdf containing plots of the calculated metrics | ||
#' @param permutations the number of randomly permuted networks to create and run ORA on | ||
#' @param penalty the penalty applied to the 'sum' metric for each TF in the network | ||
#' @param fdr_threshold the FDR threshold for a gene set term to be considered significantly | ||
#' over-represented for the purposes of calculating the 'percent' metric | ||
#' @param get_sum boolean whether to get and plot the 'sum' metric, which is the sum of the negative | ||
#' log base 10 of the p-value for the top term of each source node minus 'penalty' times the total | ||
#' number of source nodes. | ||
#' @param get_percent boolean whether to get and plot the 'percent' metric, which is the | ||
#' percent of source nodes with at least one term with a FDR below the 'fdr_threshold' | ||
#' @param get_mean boolean whether to get and plot the 'mean' metric, which is the mean negative | ||
#' log base 10 of the p-value for the top term of each source node regardless of significance | ||
#' @param get_median boolean whether to get and plot the 'median' metric, which is the median negative | ||
#' log base 10 of the p-value for the top term of each source node regardless of significance | ||
#' @param get_annotation_overlap boolean whether to get and plot the 'annotation_overlap' metric, | ||
#' which is the percent of source nodes that are annotated to at least one of the 16 GO terms for | ||
#' which their target genes are most enriched | ||
#' @param get_size boolean whether to get and plot the 'size' metric, which is the number of | ||
#' source nodes in the network subset that have more than one target gene with annotations. This number is | ||
#' used in the calculation of all other metrics. | ||
#' @param plot boolean whether to make plots of the calculated metrics and write them to a pdf | ||
#' | ||
#' @details | ||
#' The input file should be tab-separated with two or three columns: source node (e.g. transcription factor), | ||
#' target node (e.g. the regulated gene), and, optionally, edge score. | ||
#' | ||
#' @return output of `get_metrics`. Can be used as input to `plot_metrics`. | ||
#' | ||
#' @export | ||
evaluate <- function(network, reference_set, output_directory, network_name, edges = c(512, 1024, 2048, 4096, 8192, 16384, 32768, 65536), num_possible_TFs = 0, | ||
permutations = 3, get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, plot = TRUE) { | ||
evaluate <- function(network, reference_set, output_directory, network_name, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", gene_id = "ensembl_gene_id", | ||
edges = c(512, 1024, 2048, 4096, 8192, 16384, 32768, 65536), num_possible_TFs = 0, permutations = 3, penalty = 3, fdr_threshold = 0.05, | ||
get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, plot = TRUE) { | ||
|
||
# first package function | ||
subset_network(network, file.path(output_directory, paste0(network_name, "_subsets")), network_name, edges, num_possible_TFs) | ||
|
||
# for unique names to avoid overwriting | ||
formatted_time <- format(Sys.time(), "%Y-%m-%d-%H%M") | ||
summaries_suffix <- paste0("_summaries_", formatted_time) | ||
|
||
# sequential loop over all created subsets | ||
for (subset in list.files(file.path(output_directory, paste0(network_name, "_subsets")), full.names = TRUE)) { | ||
# second package function | ||
webgestalt_network(network_path = subset, | ||
reference_set = reference_set, | ||
output_directory = file.path(output_directory, paste0(network_name, "_summaries")), | ||
output_directory = file.path(output_directory, paste0(network_name, summaries_suffix)), | ||
# this just gets the name of each file minus the extension | ||
network_name = strsplit(basename(subset), "[.]")[[1]][1], | ||
organism = organism, | ||
database = database, | ||
gene_id = gene_id, | ||
permutations = permutations) | ||
} | ||
|
||
metric_dfs_by_net <- mapply(get_metrics, file.path(output_directory, paste0(network_name, "_summaries")), MoreArgs=list(get_sum = get_sum, get_percent = get_percent, get_mean = get_mean, get_median = get_median, get_annotation_overlap = get_annotation_overlap, get_size = get_size, parallel = FALSE), SIMPLIFY = FALSE) | ||
# third package function | ||
# mapply returns a list of length 1 containing the output of get_metrics | ||
metric_dfs_by_net <- mapply(get_metrics, file.path(output_directory, paste0(network_name, summaries_suffix)), MoreArgs=list(organism = organism, database = database, gene_id = gene_id, get_sum = get_sum, get_percent = get_percent, get_mean = get_mean, get_median = get_median, get_annotation_overlap = get_annotation_overlap, get_size = get_size, penalty = penalty, fdr_threshold = fdr_threshold, parallel = FALSE), SIMPLIFY = FALSE) | ||
|
||
if (plot) { | ||
pdf(file.path(output_directory, paste0(network_name, "_ORA_metrics_plots.pdf")), 7, 5) | ||
pdf(file.path(output_directory, paste0(network_name, "_ORA_metrics_plots_", formatted_time, ".pdf")), 7, 5) | ||
# fourth package function | ||
plot_data <- plot_metrics(metric_dfs_by_net, network_name, "", perTF = (num_possible_TFs > 0), sum = get_sum, percent = get_percent, mean = get_mean, median = get_median, annotation_overlap = get_annotation_overlap, size = get_size) | ||
dev.off() | ||
} | ||
|
||
# return output of get_metrics to easily re-plot with plot_metrics | ||
return(metric_dfs_by_net) | ||
} |
Oops, something went wrong.