From f9b85c4cef03b0fc9ecf6d70263a2742542240ac Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 9 Oct 2024 17:43:34 -0700 Subject: [PATCH] Tweak revision_summary tidyselect, remove redundant arrange * Produce error rather than default selection when user provides a tidyselection in ... but it selects zero columns. * Change time_within_x_latest to take `values` as a vector * Use `.data` instead of `pick` etc. in some places --- NAMESPACE | 1 + R/key_colnames.R | 2 +- R/revision_analysis.R | 48 +++++++++++-------- man/revision_summary.Rd | 2 +- man/time_within_x_latest.Rd | 2 +- .../test-revision-latency-functions.R | 39 ++++++++------- 6 files changed, 54 insertions(+), 40 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f9148caf..a6910df5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -191,6 +191,7 @@ importFrom(rlang,caller_arg) importFrom(rlang,caller_env) importFrom(rlang,check_dots_empty) importFrom(rlang,check_dots_empty0) +importFrom(rlang,dots_n) importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,env) diff --git a/R/key_colnames.R b/R/key_colnames.R index ce2825aa..3340cc80 100644 --- a/R/key_colnames.R +++ b/R/key_colnames.R @@ -7,7 +7,7 @@ #' @param x an object, such as an [`epi_df`] #' @param ... additional arguments passed on to methods #' @param other_keys character vector; what besides `geo_value` and `time_value` -#' (if present) should we consider to be key columns? Used, e.g., if we +#' (if present) should we consider to be key columns? Used, e.g., if we #' @param exclude an optional character vector of key column names to exclude #' from the result #' @return character vector diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 7de9fa5b..6c389b0c 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -19,7 +19,7 @@ #' be `NA` whenever `spread` is 0. #' 9. `time_near_latest`: This gives the lag when the value is within #' `within_latest` (default 20%) of the value at the latest time. For example, -#' consider the series (0,20, 99, 150, 102, 100); then `time_near_latest` is +#' consider the series (0, 20, 99, 150, 102, 100); then `time_near_latest` is #' the 5th index, since even though 99 is within 20%, it is outside the window #' afterwards at 150. #' @param epi_arch an epi_archive to be analyzed @@ -64,7 +64,7 @@ #' revision_example %>% arrange(desc(spread)) #' @export #' @importFrom cli cli_inform cli_abort cli_li -#' @importFrom rlang list2 syms +#' @importFrom rlang list2 syms dots_n #' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across #' everything ungroup summarize if_else %>% revision_summary <- function(epi_arch, @@ -80,12 +80,19 @@ revision_summary <- function(epi_arch, compactify_tol = .Machine$double.eps^0.5, should_compactify = TRUE) { assert_class(epi_arch, "epi_archive") - arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT)) - if (length(arg) == 0) { - # Choose the first column that's not a key or version + if (dots_n(...) == 0) { + # Choose the first column that's not a key: arg <- setdiff(names(epi_arch$DT), key_colnames(epi_arch))[[1]] - } else if (length(arg) > 1) { - cli_abort("Not currently implementing more than one column at a time. Run each separately") + } else { + arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT)) + if (length(arg) == 0) { + cli_abort("Could not find any columns matching the selection in `...`.", + class = "epiprocess__revision_summary__selected_zero_columns" + ) + } + if (length(arg) > 1) { + cli_abort("Not currently implementing more than one column at a time. Run each separately.") + } } if (is.null(abs_spread_threshold)) { abs_spread_threshold <- .05 * epi_arch$DT %>% @@ -99,10 +106,12 @@ revision_summary <- function(epi_arch, # the max lag # # revision_tibble - keys <- key_colnames(epi_arch, exclude = "version") + epikey_names <- key_colnames(epi_arch, exclude = c("time_value", "version")) + epikeytime_names <- c(epikey_names, "time_value") + keys <- c(epikeytime_names, "version") revision_behavior <- epi_arch$DT %>% - select(all_of(unique(c("geo_value", "time_value", keys, "version", arg)))) + select(all_of(unique(c(keys, arg)))) if (!is.null(min_waiting_period)) { revision_behavior <- revision_behavior %>% filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period) @@ -112,27 +121,26 @@ revision_summary <- function(epi_arch, # if we're dropping NA's, we should recompactify revision_behavior <- revision_behavior %>% - filter(!is.na(c_across(!!arg))) + filter(!is.na(.data[[arg]])) } else { revision_behavior <- epi_arch$DT } if (should_compactify) { revision_behavior <- revision_behavior %>% - arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying - apply_compactify(c(keys, version), compactify_tol) + apply_compactify(keys, compactify_tol) } revision_behavior <- revision_behavior %>% mutate(lag = as.integer(version) - as.integer(time_value)) %>% # nolint: object_usage_linter - group_by(across(all_of(keys))) %>% # group by all the keys + group_by(across(all_of(epikeytime_names))) %>% # group = versions of one measurement summarize( n_revisions = dplyr::n() - 1, min_lag = min(lag), # nolint: object_usage_linter max_lag = max(lag), # nolint: object_usage_linter - min_value = f_no_na(min, pick(!!arg)), - max_value = f_no_na(max, pick(!!arg)), - median_value = f_no_na(median, pick(!!arg)), - time_to = time_within_x_latest(lag, pick(!!arg), prop = within_latest), # nolint: object_usage_linter + min_value = f_no_na(min, .data[[arg]]), + max_value = f_no_na(max, .data[[arg]]), + median_value = f_no_na(median, .data[[arg]]), + time_to = time_within_x_latest(lag, .data[[arg]], prop = within_latest), .groups = "drop" ) %>% mutate( @@ -145,7 +153,7 @@ revision_summary <- function(epi_arch, ) %>% select(-time_to) %>% relocate( - time_value, geo_value, all_of(keys), n_revisions, min_lag, max_lag, # nolint: object_usage_linter + time_value, geo_value, all_of(epikey_names), n_revisions, min_lag, max_lag, # nolint: object_usage_linter time_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter ) if (print_inform) { @@ -203,10 +211,9 @@ revision_summary <- function(epi_arch, } #' pull the value from lags when values starts indefinitely being within prop of it's last value. -#' @param values this should be a 1 column tibble. errors may occur otherwise +#' @param values this should be a vector (e.g., a column). errors may occur otherwise #' @keywords internal time_within_x_latest <- function(lags, values, prop = .2) { - values <- values[[1]] latest_value <- values[[length(values)]] close_enough <- abs(values - latest_value) < prop * latest_value # we want to ignore any stretches where it's close, but goes farther away later @@ -222,7 +229,6 @@ time_within_x_latest <- function(lags, values, prop = .2) { #' @keywords internal get_last_run <- function(bool_vec, values_from) { runs <- rle(bool_vec) - length(bool_vec) - tail(runs$lengths, n = 1) values_from[[length(bool_vec) - tail(runs$lengths, n = 1) + 1]] } diff --git a/man/revision_summary.Rd b/man/revision_summary.Rd index 590a1ed5..10a05046 100644 --- a/man/revision_summary.Rd +++ b/man/revision_summary.Rd @@ -88,7 +88,7 @@ always be less than 1). Note that this need not be the final value. It will be \code{NA} whenever \code{spread} is 0. \item \code{time_near_latest}: This gives the lag when the value is within \code{within_latest} (default 20\%) of the value at the latest time. For example, -consider the series (0,20, 99, 150, 102, 100); then \code{time_near_latest} is +consider the series (0, 20, 99, 150, 102, 100); then \code{time_near_latest} is the 5th index, since even though 99 is within 20\%, it is outside the window afterwards at 150. } diff --git a/man/time_within_x_latest.Rd b/man/time_within_x_latest.Rd index 1dd7e801..9faa3160 100644 --- a/man/time_within_x_latest.Rd +++ b/man/time_within_x_latest.Rd @@ -7,7 +7,7 @@ time_within_x_latest(lags, values, prop = 0.2) } \arguments{ -\item{values}{this should be a 1 column tibble. errors may occur otherwise} +\item{values}{this should be a vector (e.g., a column). errors may occur otherwise} } \description{ pull the value from lags when values starts indefinitely being within prop of it's last value. diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R index df10937a..129deba5 100644 --- a/tests/testthat/test-revision-latency-functions.R +++ b/tests/testthat/test-revision-latency-functions.R @@ -1,17 +1,5 @@ dummy_ex <- tibble::tribble( ~geo_value, ~time_value, ~version, ~value, - # al 1 has 1 real revision, a lag of 0, and changes by 99 - "al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1, - "al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1, - "al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100, - # al 2 has no revision, a min lag of 0, and a rel_spread of 0 - "al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1, - # al 3 has 1 revision and a min lag of 1, and a change of 3 - "al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1, - "al", as.Date("2020-01-03"), as.Date("2020-01-05"), 4, - # al 4 has 1 revision including NA's none if not, a lag of 0/1 and changes of 0 - "al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA, - "al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9, # ak 1 has 4 revisions w/out NAs, but 6 with NAs # a min lag of 2, and a change of 101 "ak", as.Date("2020-01-01"), as.Date("2020-01-03"), 1, @@ -27,6 +15,18 @@ dummy_ex <- tibble::tribble( # ak 3 has 0 revisions, and a value of zero, and thus a rel_spread of NaN "ak", as.Date("2020-01-03"), as.Date("2020-01-06"), 0, "ak", as.Date("2020-01-03"), as.Date("2020-01-07"), 0, + # al 1 has 1 real revision, a lag of 0, and changes by 99 + "al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1, + "al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1, + "al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100, + # al 2 has no revision, a min lag of 0, and a rel_spread of 0 + "al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1, + # al 3 has 1 revision and a min lag of 1, and a change of 3 + "al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1, + "al", as.Date("2020-01-03"), as.Date("2020-01-05"), 4, + # al 4 has 1 revision including NA's none if not, a lag of 0/1 and changes of 0 + "al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA, + "al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9, ) %>% as_epi_archive(versions_end = as.Date("2022-01-01"), compactify = FALSE) @@ -40,12 +40,19 @@ test_that("tidyselect is functional", { with_later_key_col <- dummy_ex$DT %>% select(geo_value, time_value, value, version) %>% as_epi_archive(versions_end = dummy_ex$versions_end, compactify = FALSE) - expect_equal(quiet(revision_summary(with_later_key_col)), - quiet(revision_summary(dummy_ex))) + expect_equal( + quiet(revision_summary(with_later_key_col)), + quiet(revision_summary(dummy_ex)) + ) with_later_val_col <- dummy_ex$DT %>% mutate(value2 = 0) %>% as_epi_archive(versions_end = dummy_ex$versions_end, compactify = FALSE) - expect_equal(quiet(revision_summary(with_later_val_col, value)), - quiet(revision_summary(dummy_ex, value))) + expect_equal( + quiet(revision_summary(with_later_val_col, value)), + quiet(revision_summary(dummy_ex, value)) + ) + expect_error(revision_summary(with_later_val_col, !everything()), + class = "epiprocess__revision_summary__selected_zero_columns" + ) }) test_that("revision_summary works for various timetypes", {})