From f9b85c4cef03b0fc9ecf6d70263a2742542240ac Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Wed, 9 Oct 2024 17:43:34 -0700
Subject: [PATCH] Tweak revision_summary tidyselect, remove redundant arrange

* Produce error rather than default selection when user provides a tidyselection
  in ... but it selects zero columns.
* Change time_within_x_latest to take `values` as a vector
* Use `.data` instead of `pick` etc. in some places
---
 NAMESPACE                                     |  1 +
 R/key_colnames.R                              |  2 +-
 R/revision_analysis.R                         | 48 +++++++++++--------
 man/revision_summary.Rd                       |  2 +-
 man/time_within_x_latest.Rd                   |  2 +-
 .../test-revision-latency-functions.R         | 39 ++++++++-------
 6 files changed, 54 insertions(+), 40 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index f9148caf..a6910df5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -191,6 +191,7 @@ importFrom(rlang,caller_arg)
 importFrom(rlang,caller_env)
 importFrom(rlang,check_dots_empty)
 importFrom(rlang,check_dots_empty0)
+importFrom(rlang,dots_n)
 importFrom(rlang,enquo)
 importFrom(rlang,enquos)
 importFrom(rlang,env)
diff --git a/R/key_colnames.R b/R/key_colnames.R
index ce2825aa..3340cc80 100644
--- a/R/key_colnames.R
+++ b/R/key_colnames.R
@@ -7,7 +7,7 @@
 #' @param x an object, such as an [`epi_df`]
 #' @param ... additional arguments passed on to methods
 #' @param other_keys character vector; what besides `geo_value` and `time_value`
-#'   (if present) should we consider to be key columns?  Used, e.g., if we 
+#'   (if present) should we consider to be key columns?  Used, e.g., if we
 #' @param exclude an optional character vector of key column names to exclude
 #'   from the result
 #' @return character vector
diff --git a/R/revision_analysis.R b/R/revision_analysis.R
index 7de9fa5b..6c389b0c 100644
--- a/R/revision_analysis.R
+++ b/R/revision_analysis.R
@@ -19,7 +19,7 @@
 #'   be `NA` whenever `spread` is 0.
 #'  9. `time_near_latest`: This gives the lag when the value is within
 #'   `within_latest` (default 20%) of the value at the latest time. For example,
-#'   consider the series (0,20, 99, 150, 102, 100); then `time_near_latest` is
+#'   consider the series (0, 20, 99, 150, 102, 100); then `time_near_latest` is
 #'   the 5th index, since even though 99 is within 20%, it is outside the window
 #'   afterwards at 150.
 #' @param epi_arch an epi_archive to be analyzed
@@ -64,7 +64,7 @@
 #' revision_example %>% arrange(desc(spread))
 #' @export
 #' @importFrom cli cli_inform cli_abort cli_li
-#' @importFrom rlang list2 syms
+#' @importFrom rlang list2 syms dots_n
 #' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across
 #'   everything ungroup summarize if_else %>%
 revision_summary <- function(epi_arch,
@@ -80,12 +80,19 @@ revision_summary <- function(epi_arch,
                              compactify_tol = .Machine$double.eps^0.5,
                              should_compactify = TRUE) {
   assert_class(epi_arch, "epi_archive")
-  arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT))
-  if (length(arg) == 0) {
-    # Choose the first column that's not a key or version
+  if (dots_n(...) == 0) {
+    # Choose the first column that's not a key:
     arg <- setdiff(names(epi_arch$DT), key_colnames(epi_arch))[[1]]
-  } else if (length(arg) > 1) {
-    cli_abort("Not currently implementing more than one column at a time. Run each separately")
+  } else {
+    arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT))
+    if (length(arg) == 0) {
+      cli_abort("Could not find any columns matching the selection in `...`.",
+        class = "epiprocess__revision_summary__selected_zero_columns"
+      )
+    }
+    if (length(arg) > 1) {
+      cli_abort("Not currently implementing more than one column at a time. Run each separately.")
+    }
   }
   if (is.null(abs_spread_threshold)) {
     abs_spread_threshold <- .05 * epi_arch$DT %>%
@@ -99,10 +106,12 @@ revision_summary <- function(epi_arch,
   #   the max lag
   #
   # revision_tibble
-  keys <- key_colnames(epi_arch, exclude = "version")
+  epikey_names <- key_colnames(epi_arch, exclude = c("time_value", "version"))
+  epikeytime_names <- c(epikey_names, "time_value")
+  keys <- c(epikeytime_names, "version")
 
   revision_behavior <- epi_arch$DT %>%
-    select(all_of(unique(c("geo_value", "time_value", keys, "version", arg))))
+    select(all_of(unique(c(keys, arg))))
   if (!is.null(min_waiting_period)) {
     revision_behavior <- revision_behavior %>%
       filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period)
@@ -112,27 +121,26 @@ revision_summary <- function(epi_arch,
     # if we're dropping NA's, we should recompactify
     revision_behavior <-
       revision_behavior %>%
-      filter(!is.na(c_across(!!arg)))
+      filter(!is.na(.data[[arg]]))
   } else {
     revision_behavior <- epi_arch$DT
   }
   if (should_compactify) {
     revision_behavior <- revision_behavior %>%
-      arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying
-      apply_compactify(c(keys, version), compactify_tol)
+      apply_compactify(keys, compactify_tol)
   }
   revision_behavior <-
     revision_behavior %>%
     mutate(lag = as.integer(version) - as.integer(time_value)) %>% # nolint: object_usage_linter
-    group_by(across(all_of(keys))) %>% # group by all the keys
+    group_by(across(all_of(epikeytime_names))) %>% # group = versions of one measurement
     summarize(
       n_revisions = dplyr::n() - 1,
       min_lag = min(lag), # nolint: object_usage_linter
       max_lag = max(lag), # nolint: object_usage_linter
-      min_value = f_no_na(min, pick(!!arg)),
-      max_value = f_no_na(max, pick(!!arg)),
-      median_value = f_no_na(median, pick(!!arg)),
-      time_to = time_within_x_latest(lag, pick(!!arg), prop = within_latest), # nolint: object_usage_linter
+      min_value = f_no_na(min, .data[[arg]]),
+      max_value = f_no_na(max, .data[[arg]]),
+      median_value = f_no_na(median, .data[[arg]]),
+      time_to = time_within_x_latest(lag, .data[[arg]], prop = within_latest),
       .groups = "drop"
     ) %>%
     mutate(
@@ -145,7 +153,7 @@ revision_summary <- function(epi_arch,
     ) %>%
     select(-time_to) %>%
     relocate(
-      time_value, geo_value, all_of(keys), n_revisions, min_lag, max_lag, # nolint: object_usage_linter
+      time_value, geo_value, all_of(epikey_names), n_revisions, min_lag, max_lag, # nolint: object_usage_linter
       time_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter
     )
   if (print_inform) {
@@ -203,10 +211,9 @@ revision_summary <- function(epi_arch,
 }
 
 #' pull the value from lags when values starts indefinitely being within prop of it's last value.
-#' @param values this should be a 1 column tibble. errors may occur otherwise
+#' @param values this should be a vector (e.g., a column). errors may occur otherwise
 #' @keywords internal
 time_within_x_latest <- function(lags, values, prop = .2) {
-  values <- values[[1]]
   latest_value <- values[[length(values)]]
   close_enough <- abs(values - latest_value) < prop * latest_value
   # we want to ignore any stretches where it's close, but goes farther away later
@@ -222,7 +229,6 @@ time_within_x_latest <- function(lags, values, prop = .2) {
 #' @keywords internal
 get_last_run <- function(bool_vec, values_from) {
   runs <- rle(bool_vec)
-  length(bool_vec) - tail(runs$lengths, n = 1)
   values_from[[length(bool_vec) - tail(runs$lengths, n = 1) + 1]]
 }
 
diff --git a/man/revision_summary.Rd b/man/revision_summary.Rd
index 590a1ed5..10a05046 100644
--- a/man/revision_summary.Rd
+++ b/man/revision_summary.Rd
@@ -88,7 +88,7 @@ always be less than 1). Note that this need not be the final value. It will
 be \code{NA} whenever \code{spread} is 0.
 \item \code{time_near_latest}: This gives the lag when the value is within
 \code{within_latest} (default 20\%) of the value at the latest time. For example,
-consider the series (0,20, 99, 150, 102, 100); then \code{time_near_latest} is
+consider the series (0, 20, 99, 150, 102, 100); then \code{time_near_latest} is
 the 5th index, since even though 99 is within 20\%, it is outside the window
 afterwards at 150.
 }
diff --git a/man/time_within_x_latest.Rd b/man/time_within_x_latest.Rd
index 1dd7e801..9faa3160 100644
--- a/man/time_within_x_latest.Rd
+++ b/man/time_within_x_latest.Rd
@@ -7,7 +7,7 @@
 time_within_x_latest(lags, values, prop = 0.2)
 }
 \arguments{
-\item{values}{this should be a 1 column tibble. errors may occur otherwise}
+\item{values}{this should be a vector (e.g., a column). errors may occur otherwise}
 }
 \description{
 pull the value from lags when values starts indefinitely being within prop of it's last value.
diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R
index df10937a..129deba5 100644
--- a/tests/testthat/test-revision-latency-functions.R
+++ b/tests/testthat/test-revision-latency-functions.R
@@ -1,17 +1,5 @@
 dummy_ex <- tibble::tribble(
   ~geo_value, ~time_value, ~version, ~value,
-  # al 1 has 1 real revision, a lag of 0, and changes by 99
-  "al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1,
-  "al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1,
-  "al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100,
-  # al 2 has no revision, a min lag of 0, and a rel_spread of 0
-  "al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1,
-  # al 3 has 1 revision and a min lag of 1, and a change of 3
-  "al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1,
-  "al", as.Date("2020-01-03"), as.Date("2020-01-05"), 4,
-  # al 4 has 1 revision including NA's none if not, a lag of 0/1 and changes of 0
-  "al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA,
-  "al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9,
   # ak 1 has 4 revisions w/out NAs, but 6 with NAs
   # a min lag of 2, and a change of 101
   "ak", as.Date("2020-01-01"), as.Date("2020-01-03"), 1,
@@ -27,6 +15,18 @@ dummy_ex <- tibble::tribble(
   # ak 3 has 0 revisions, and a value of zero, and thus a rel_spread of NaN
   "ak", as.Date("2020-01-03"), as.Date("2020-01-06"), 0,
   "ak", as.Date("2020-01-03"), as.Date("2020-01-07"), 0,
+  # al 1 has 1 real revision, a lag of 0, and changes by 99
+  "al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1,
+  "al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1,
+  "al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100,
+  # al 2 has no revision, a min lag of 0, and a rel_spread of 0
+  "al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1,
+  # al 3 has 1 revision and a min lag of 1, and a change of 3
+  "al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1,
+  "al", as.Date("2020-01-03"), as.Date("2020-01-05"), 4,
+  # al 4 has 1 revision including NA's none if not, a lag of 0/1 and changes of 0
+  "al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA,
+  "al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9,
 ) %>%
   as_epi_archive(versions_end = as.Date("2022-01-01"), compactify = FALSE)
 
@@ -40,12 +40,19 @@ test_that("tidyselect is functional", {
   with_later_key_col <- dummy_ex$DT %>%
     select(geo_value, time_value, value, version) %>%
     as_epi_archive(versions_end = dummy_ex$versions_end, compactify = FALSE)
-  expect_equal(quiet(revision_summary(with_later_key_col)),
-               quiet(revision_summary(dummy_ex)))
+  expect_equal(
+    quiet(revision_summary(with_later_key_col)),
+    quiet(revision_summary(dummy_ex))
+  )
   with_later_val_col <- dummy_ex$DT %>%
     mutate(value2 = 0) %>%
     as_epi_archive(versions_end = dummy_ex$versions_end, compactify = FALSE)
-  expect_equal(quiet(revision_summary(with_later_val_col, value)),
-               quiet(revision_summary(dummy_ex, value)))
+  expect_equal(
+    quiet(revision_summary(with_later_val_col, value)),
+    quiet(revision_summary(dummy_ex, value))
+  )
+  expect_error(revision_summary(with_later_val_col, !everything()),
+    class = "epiprocess__revision_summary__selected_zero_columns"
+  )
 })
 test_that("revision_summary works for various timetypes", {})