saezlab · PauBadiaM · Oct 19, 2024 · Jul 18, 2024
diff --git a/R/statistic-zscore.R b/R/statistic-zscore.R
@@ -0,0 +1,152 @@
+#' z-score
+#'
+#' @description
+#' Calculates regulatory activities using a z-score as descibed in KSEA or RoKAI.
+#'
+#' @details
+#' The z-score calculates the mean of the molecular features of the known targets
+#' for each regulator and adjusts it for the number of identified targets for the 
+#' regulator, the standard deviation of all molecular features (RoKAI), as well as
+#' the mean of all moleculare features (KSEA). 
+#'
+#' @inheritParams .decoupler_mat_format
+#' @inheritParams .decoupler_network_format
+#' @param sparse Deprecated parameter.
+#' @param center Logical value indicating if `mat` must be centered by
+#' [base::rowMeans()].
+#' @param na.rm Should missing values (including NaN) be omitted from the
+#'  calculations of [base::rowMeans()]?
+#' @param minsize Integer indicating the minimum number of targets per source.
+#' @param flavor Whether the calculation should be based on RoKAI (default) or
+#'  KSEA.
+#'
+#' @return A long format tibble of the enrichment scores for each source
+#'  across the samples. Resulting tibble contains the following columns:
+#'  1. `statistic`: Indicates which method is associated with which score.
+#'  2. `source`: Source nodes of `network`.
+#'  3. `condition`: Condition representing each column of `mat`.
+#'  4. `score`: Regulatory activity (enrichment score).
+#' @family decoupleR statistics
+#' @export
+#'
+#' @importFrom stats coef lm summary.lm
+#' @importFrom magrittr %<>% %>%
+#' @importFrom dplyr ungroup
+#' @examples
+#' inputs_dir <- system.file("testdata", "inputs", package = "decoupleR")
+#'
+#' mat <- readRDS(file.path(inputs_dir, "mat.rds"))
+#' net <- readRDS(file.path(inputs_dir, "net.rds"))
+#'
+#' run_zscore(mat, net, minsize=0)
+run_zscore <- function(mat,
+                    network,
+                    .source = source,
+                    .target = target,
+                    .mor = mor,
+                    .likelihood = likelihood,
+                    sparse = FALSE,
+                    center = FALSE,
+                    na.rm = FALSE,
+                    minsize = 5L,
+                    flavor = "RoKAI"
+) {
+
+  # NSE vs. R CMD check workaround
+  condition <- likelihood <- mor <- p_value <- score <-
+    source <- statistic <- target <- NULL
+
+  # Check for NAs/Infs in mat
+  mat %<>% check_nas_infs
+
+  network %>%
+    # Convert to standard tibble: source-target-mor.
+    rename_net(
+      {{ .source }},
+      {{ .target }},
+      {{ .mor }},
+      {{ .likelihood }}
+    ) %>%
+    filt_minsize(rownames(mat), ., minsize) %>%
+    # Preprocessing -------------------------------------------------------
+  .fit_preprocessing(mat, center, na.rm, sparse) %>%
+    # Model evaluation ----------------------------------------------------
+  {.zscore_analysis(.$mat, .$mor_mat, flavor)} %>%
+    ungroup()
+
+}
+
+#' Wrapper to execute run_zscore() logic on preprocessed data
+#'
+#' Calculate a z-score from the molecular features of its targets and 
+#' normalise it by the background molecular features.
+#'
+#' @inheritParams run_zscore
+#' @param mor_mat
+#'
+#' @inherit run_zscore return
+#' @keywords intern
+#' @importFrom stats pnorm
+#' @importFrom dplyr filter rename mutate group_by summarise arrange
+#' @importFrom tibble rownames_to_column tibble
+#' @importFrom tidyr pivot_longer drop_na
+#' @importFrom purrr map_dfr
+#' @importFrom magrittr %<>% %>%
+#' @noRd
+.zscore_analysis <- function(mat, mor_mat, flavor) {
+  net <- mor_mat %>%
+    as.data.frame() %>%
+    rownames_to_column("target") %>%
+    pivot_longer(cols = -target, values_to = "mor", names_to = "source") %>%
+    filter(mor != 0)
+
+  scores <- purrr::map_dfr(seq_len(ncol(mat)), function(exp){
+    # Convert column of mat to data frame and drop NA rows
+    mat_c <- mat[, exp, drop = FALSE] %>%
+      data.frame() %>%
+      drop_na() %>%
+      rownames_to_column("target")
+
+    # Perform inner join and filter NA rows
+    KSdata <- full_join(net, mat_c, by = "target") %>%
+      drop_na() %>%
+      rename("stat" = colnames(.)[4])
+
+    # Calculate value based on mor and stat
+    KSdata <- KSdata %>%
+      mutate(value = mor * stat)
+
+    # Aggregate mean values for each source
+    Mean.FC <- KSdata %>%
+      group_by(source) %>%
+      summarise(mS = mean(value), m = n()) %>%
+      arrange(source)
+
+    # Calculate Enrichment and z-score
+    mean_value <- mean(mat_c[, 2], na.rm = TRUE)
+    if (flavor == "RoKAI") {
+      mean_value <- 0
+    }
+    Mean.FC <- Mean.FC %>%
+      mutate(
+        Enrichment = mS / abs(mean(mat_c[,2], na.rm = TRUE)),
+        z.score = ((mS - mean_value) * sqrt(m)) / sd(mat_c[, 2], na.rm = TRUE),
+        p.value = pnorm(-abs(z.score))
+      )
+
+    # Prepare and return result as tibble
+    tibble(
+      statistic = "z_score",
+      source = Mean.FC$source,
+      condition = colnames(mat_c)[2],
+      score = Mean.FC$z.score,
+      p_value = Mean.FC$p.value
+    )
+  })
+  # Arrange scores according to the order of sources in mor_mat
+  scores <- scores %>%
+    arrange(match(source, colnames(mor_mat)))
+
+  return(scores)
+}
+
diff --git a/tests/testthat/test-statistic-zscore.R b/tests/testthat/test-statistic-zscore.R
@@ -0,0 +1,31 @@
+library(decoupleR)
+
+# Directories -------------------------------------------------------------
+
+# Inputs
+input_dir <- system.file("testdata", "inputs", package = "decoupleR")
+
+
+# Data to run -------------------------------------------------------------
+
+mat <- file.path(input_dir, "mat.rds") %>%
+  readRDS()
+
+net <- file.path(input_dir, "net.rds") %>%
+  readRDS()
+
+# Test for run_wmean function ---------------------------------------------
+
+test_that("test run_zscore", {
+  res_1 <- run_zscore(mat, net, minsize=0)
+
+  expect_equal(res_1$score[1], 3.52, tolerance=0.01)
+  expect_equal(sign(range(res_1$score)), c(-1, 1))
+
+  net_2 <- net
+  net_2$mor <- 1
+  res_2 <- run_zscore(mat, net_2, minsize=0)
+
+  expect_equal(res_2$score[1], 3.89, tolerance=0.01)
+  expect_equal(sign(range(res_2$score)), c(1, 1))
+})