Skip to content

Commit

Permalink
Merge pull request #25 from westbrooktm/dev
Browse files Browse the repository at this point in the history
Update to 1.0.0
  • Loading branch information
westbrooktm authored Jul 28, 2023
2 parents 3f20b1c + d3ba5b5 commit 2d35696
Show file tree
Hide file tree
Showing 13 changed files with 441 additions and 260 deletions.
2 changes: 2 additions & 0 deletions .lintr
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
linters: linters_with_defaults() # see vignette("lintr")
encoding: "UTF-8"
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: GOeval
Title: Gene Regulatory Network Evaluation Using Gene Ontology
Version: 0.0.0.9000
Version: 1.0.0
Authors@R:
person("Thomas", "Westbrook", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "YOUR-ORCID-ID"))
Expand Down
182 changes: 126 additions & 56 deletions R/evaluate.R
Original file line number Diff line number Diff line change
@@ -1,88 +1,158 @@

#' Perform the whole GOeval pipeline on one network
#'
#' @importFrom utils read.table
#'
#' @description
#' Given one network, `evaluate` will first make subsets with `subset_network`,
#' then run Over-Representation Analysis (ORA) with `webgestalt_network`,
#' followed by `get_metrics` and `plot_metrics` to plot selected summary statistics.
#' followed by `get_metrics` and `plot_metrics` to plot selected summary
#' statistics.
#'
#' @param network path to the file containing the network to create subsets from. The file should
#' be tab-separated with three columns: source node, target node, edge score
#' @param reference_set path to the set of all genes possibly included in the network. Must be a .txt
#' file containing exactly one column of the genes that could possibly appear in the network.
#' @param output_directory path to the directory in which to store the generated network subsets,
#' ORA summaries, and plots
#' @param network_name short name for the network - used in file naming so may not contain spaces
#' @param network path to the file containing the network to create subsets
#' from. The file should be tab-separated with three columns: source node,
#' target node, edge score
#' @param reference_set path to the set of all genes possibly included in the
#' network. Must be a .txt file containing exactly one column of the genes that
#' could possibly appear in the network.
#' @param output_directory path to the directory in which to store the generated
#' network subsets, ORA summaries, and plots
#' @param network_name short name for the network - used in file naming so may
#' not contain spaces
#' @param organism a string specifying the organism that the data is from, e.g.
#' "hsapiens" or "scerevisiae" - see options with WebGestaltR::listOrganism()
#' @param database the gene set database to search for enrichment - see options with WebGestaltR::listGeneSet().
#' Must be a Gene Ontology "biological process" database if get_annotation_overlap = TRUE.
#' @param gene_id the naming system used for the input genes - see options with WebGestaltR::listIdType()
#' and see webgestalt.org for examples of each type
#' @param edges list of total numbers of edges or average edges per TF to include in each subset
#' @param num_possible_TFs if set to a number > 0, the elements of 'edges' will first be
#' multiplied by this number to get the number of edges for each subset
#' @param permutations the number of randomly permuted networks to create and run ORA on
#' @param penalty the penalty applied to the 'sum' metric for each TF in the network
#' @param fdr_threshold the FDR threshold for a gene set term to be considered significantly
#' over-represented for the purposes of calculating the 'percent' metric
#' @param get_sum boolean whether to get and plot the 'sum' metric, which is the sum of the negative
#' log base 10 of the p-value for the top term of each source node minus 'penalty' times the total
#' number of source nodes.
#' @param get_percent boolean whether to get and plot the 'percent' metric, which is the
#' percent of source nodes with at least one term with a FDR below the 'fdr_threshold'
#' @param get_mean boolean whether to get and plot the 'mean' metric, which is the mean negative
#' log base 10 of the p-value for the top term of each source node regardless of significance
#' @param get_median boolean whether to get and plot the 'median' metric, which is the median negative
#' log base 10 of the p-value for the top term of each source node regardless of significance
#' @param get_annotation_overlap boolean whether to get and plot the 'annotation_overlap' metric,
#' which is the percent of source nodes that are annotated to at least one of the 16 GO terms for
#' which their target genes are most enriched
#' @param get_size boolean whether to get and plot the 'size' metric, which is the number of
#' source nodes in the network subset that have more than one target gene with annotations. This number is
#' used in the calculation of all other metrics.
#' @param plot boolean whether to make plots of the calculated metrics and write them to a pdf
#' @param database the gene set database to search for enrichment - see options
#' with WebGestaltR::listGeneSet(). Must be a Gene Ontology
#' "biological process" database if get_annotation_overlap = TRUE.
#' @param gene_id the naming system used for the input genes - see options with
#' WebGestaltR::listIdType() and see webgestalt.org for examples of each type
#' @param edges list of total numbers of edges or average edges per TF to
#' include in each subset
#' @param num_possible_TFs if set to a number > 0, the elements of 'edges' will
#' first be multiplied by this number to get the number of edges for each
#' subset
#' @param permutations the number of randomly permuted networks to create and
#' run ORA on
#' @param penalty the penalty applied to the 'sum' metric for each TF in the
#' network
#' @param fdr_threshold the FDR threshold for a gene set term to be considered
#' significantly over-represented for the purposes of calculating the 'percent'
#' metric
#' @param get_sum boolean whether to get and plot the 'sum' metric, which is the
#' sum of the negative log base 10 of the p-value for the top term of each
#' source node minus 'penalty' times the total number of source nodes.
#' @param get_percent boolean whether to get and plot the 'percent' metric,
#' which is the percent of source nodes with at least one term with a FDR below
#' the 'fdr_threshold'
#' @param get_mean boolean whether to get and plot the 'mean' metric, which is
#' the mean negative log base 10 of the p-value for the top term of each source
#' node regardless of significance
#' @param get_median boolean whether to get and plot the 'median' metric, which
#' is the median negative log base 10 of the p-value for the top term of each
#' source node regardless of significance
#' @param get_annotation_overlap boolean whether to get and plot the
#' 'annotation_overlap' metric, which is the percent of source nodes that are
#' annotated to at least one of the 16 GO terms for which their target genes
#' are most enriched
#' @param get_size boolean whether to get and plot the 'size' metric, which is
#' the number of source nodes in the network subset that have more than one
#' target gene with annotations. This number is used in the calculation of all
#' other metrics.
#' @param plot boolean whether to make plots of the calculated metrics and write
#' them to a pdf
#'
#' @details
#' The input file should be tab-separated with two or three columns: source node (e.g. transcription factor),
#' target node (e.g. the regulated gene), and, optionally, edge score.
#' The input file should be tab-separated with two or three columns: source node
#' (e.g. transcription factor), target node (e.g. the regulated gene), and,
#' optionally, edge score.
#'
#' @return output of `get_metrics`. Can be used as input to `plot_metrics`.
#'
#' @export
evaluate <- function(network, reference_set, output_directory, network_name, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", gene_id = "ensembl_gene_id",
edges = c(512, 1024, 2048, 4096, 8192, 16384, 32768, 65536), num_possible_TFs = 0, permutations = 3, penalty = 3, fdr_threshold = 0.05,
get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, plot = TRUE) {
evaluate <- function(network, reference_set, output_directory, network_name,
organism = "hsapiens",
database = "geneontology_Biological_Process_noRedundant",
gene_id = "ensembl_gene_id",
edges = c(512, 1024, 2048, 4096, 8192, 16384, 32768, 65536),
num_possible_TFs = 0, permutations = 3, penalty = 3,
fdr_threshold = 0.05, get_sum = TRUE, get_percent = FALSE,
get_mean = FALSE, get_median = FALSE,
get_annotation_overlap = FALSE, get_size = TRUE,
plot = TRUE) {

# check existence and format of `reference_set` before first function call
if(!file.exists(reference_set)) {
stop("Reference set file not found.")
}

ref_first_line <- tryCatch({
read.table(reference_set, nrows = 1)
}, error = function(e) {
stop("Error reading the reference set file.")
})

num_columns <- ncol(ref_first_line)
if (num_columns != 1) {
stop("Reference set file should contain only one column.")
}

# first package function
subset_network(network, file.path(output_directory, paste0(network_name, "_subsets")), network_name, edges, num_possible_TFs)
# existence and format of `network` and `network_name` checked here
subset_network(network, file.path(output_directory,
paste0(network_name, "_subsets")),
network_name, edges, num_possible_TFs)

# for unique names to avoid overwriting
formatted_time <- format(Sys.time(), "%Y-%m-%d-%H%M")
summaries_suffix <- paste0("_summaries_", formatted_time)

# sequential loop over all created subsets
for (subset in list.files(file.path(output_directory, paste0(network_name, "_subsets")), full.names = TRUE)) {
# second package function
webgestalt_network(network_path = subset,
reference_set = reference_set,
output_directory = file.path(output_directory, paste0(network_name, summaries_suffix)),
# this just gets the name of each file minus the extension
network_name = strsplit(basename(subset), "[.]")[[1]][1],
organism = organism,
database = database,
gene_id = gene_id,
permutations = permutations)
for (subset in list.files(file.path(output_directory, paste0(network_name,
"_subsets")),
full.names = TRUE)) {
# second package function
webgestalt_network(
network_path = subset,
reference_set = reference_set,
output_directory = file.path(output_directory, paste0(network_name,
summaries_suffix)),
# this just gets the name of each file minus the extension
network_name = strsplit(basename(subset), "[.]")[[1]][1],
organism = organism,
database = database,
gene_id = gene_id,
permutations = permutations
)
}

# third package function
# mapply returns a list of length 1 containing the output of get_metrics
metric_dfs_by_net <- mapply(get_metrics, file.path(output_directory, paste0(network_name, summaries_suffix)), MoreArgs=list(organism = organism, database = database, gene_id = gene_id, get_sum = get_sum, get_percent = get_percent, get_mean = get_mean, get_median = get_median, get_annotation_overlap = get_annotation_overlap, get_size = get_size, penalty = penalty, fdr_threshold = fdr_threshold, parallel = FALSE), SIMPLIFY = FALSE)
metric_dfs_by_net <- mapply(get_metrics, file.path(output_directory,
paste0(network_name,
summaries_suffix)),
MoreArgs = list(organism = organism,
database = database,
gene_id = gene_id,
get_sum = get_sum,
get_percent = get_percent,
get_mean = get_mean,
get_median = get_median,
get_annotation_overlap = get_annotation_overlap,
get_size = get_size,
penalty = penalty,
fdr_threshold = fdr_threshold,
parallel = FALSE),
SIMPLIFY = FALSE)

if (plot) {
pdf(file.path(output_directory, paste0(network_name, "_ORA_metrics_plots_", formatted_time, ".pdf")), 7, 5)
pdf(file.path(output_directory, paste0(network_name, "_ORA_metrics_plots_",
formatted_time, ".pdf")), 7, 5)
# fourth package function
plot_data <- plot_metrics(metric_dfs_by_net, network_name, "", perTF = (num_possible_TFs > 0), sum = get_sum, percent = get_percent, mean = get_mean, median = get_median, annotation_overlap = get_annotation_overlap, size = get_size)
plot_data <- plot_metrics(metric_dfs_by_net, network_name, "",
perTF = (num_possible_TFs > 0), sum = get_sum,
percent = get_percent, mean = get_mean,
median = get_median,
annotation_overlap = get_annotation_overlap,
size = get_size)
dev.off()
}

Expand Down
15 changes: 10 additions & 5 deletions R/get_metrics.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ get_network_metrics <- function(full_terms, network_size, organism, gene_id, get
neglogp <- -log10(terms$pValue)
# cap -logp values due to rounding to 0 for pval < 1E-16
neglogp <- ifelse(is.finite(neglogp), neglogp, 16)
#neglogp_zeros <- append(rep(0, network_size - length(terms$geneSet)), neglogp)
# neglogp_zeros <- append(rep(0, network_size - length(terms$geneSet)), neglogp)
if (get_annotation_overlap) {
prior_ann <- 100 * length(get_annotation_overlap(full_terms, organism, gene_id, go_ann = go_ann, go_reg = go_reg)) / network_size
} else {
Expand Down Expand Up @@ -167,13 +167,18 @@ get_network_metrics <- function(full_terms, network_size, organism, gene_id, get
#'
#' @export
get_metrics <- function(directory, organism = "hsapiens", database = "geneontology_Biological_Process_noRedundant", gene_id = "ensembl_gene_id", get_sum = TRUE, get_percent = FALSE, get_mean = FALSE, get_median = FALSE, get_annotation_overlap = FALSE, get_size = TRUE, penalty = 3, fdr_threshold = 0.05, parallel = FALSE) {

if (!dir.exists(directory)) {
stop("Directory not found.")
}

# GO regulatory relationships only needed if get_annotation_overlap = TRUE
if (get_annotation_overlap) {
# Load regulatory relationships between GO terms for the calculation of overlap between TF GO
# annotations and their targets' enriched GO terms.
go <- as.data.frame(ontologyIndex::get_ontology(file = "http://purl.obolibrary.org/obo/go/go-basic.obo", extract_tags = "everything"))
go <- go[go$namespace == 'biological_process',]
go_reg <- go[c('id', 'regulates', 'negatively_regulates', 'positively_regulates')]
go <- go[go$namespace == "biological_process", ]
go_reg <- go[c("id", "regulates", "negatively_regulates", "positively_regulates")]
rm(go)

# Load gene annotations with WebGestaltR::loadGeneSet.
Expand All @@ -200,7 +205,7 @@ get_metrics <- function(directory, organism = "hsapiens", database = "geneontolo
# but also sometimes it works, so possible race condition although I'm not sure how
results <- parallel::mclapply(networks_list, function(net) {
network_data_files <- list.files(net, recursive = TRUE, pattern = "network_data.txt", full.names = TRUE)
network_sizes <- lapply(network_data_files, function (f) {
network_sizes <- lapply(network_data_files, function(f) {
size_line <- grep("valid", readLines(f), value = TRUE)
return(as.integer(unlist(strsplit(size_line, " "))[1]))
})
Expand All @@ -212,7 +217,7 @@ get_metrics <- function(directory, organism = "hsapiens", database = "geneontolo
# non-parallel version
results <- lapply(networks_list, function(net) {
network_data_files <- list.files(net, recursive = TRUE, pattern = "network_data.txt", full.names = TRUE)
network_sizes <- lapply(network_data_files, function (f) {
network_sizes <- lapply(network_data_files, function(f) {
size_line <- grep("valid", readLines(f), value = TRUE)
return(as.integer(unlist(strsplit(size_line, " "))[1]))
})
Expand Down
Loading

0 comments on commit 2d35696

Please sign in to comment.