Skip to content

Commit

Permalink
Tweak revision_summary tidyselect, remove redundant arrange
Browse files Browse the repository at this point in the history
* Produce error rather than default selection when user provides a tidyselection
  in ... but it selects zero columns.
* Change time_within_x_latest to take `values` as a vector
* Use `.data` instead of `pick` etc. in some places
  • Loading branch information
brookslogan committed Oct 10, 2024
1 parent 2369fc0 commit f9b85c4
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 40 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ importFrom(rlang,caller_arg)
importFrom(rlang,caller_env)
importFrom(rlang,check_dots_empty)
importFrom(rlang,check_dots_empty0)
importFrom(rlang,dots_n)
importFrom(rlang,enquo)
importFrom(rlang,enquos)
importFrom(rlang,env)
Expand Down
2 changes: 1 addition & 1 deletion R/key_colnames.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#' @param x an object, such as an [`epi_df`]
#' @param ... additional arguments passed on to methods
#' @param other_keys character vector; what besides `geo_value` and `time_value`
#' (if present) should we consider to be key columns? Used, e.g., if we
#' (if present) should we consider to be key columns? Used, e.g., if we
#' @param exclude an optional character vector of key column names to exclude
#' from the result
#' @return character vector
Expand Down
48 changes: 27 additions & 21 deletions R/revision_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#' be `NA` whenever `spread` is 0.
#' 9. `time_near_latest`: This gives the lag when the value is within
#' `within_latest` (default 20%) of the value at the latest time. For example,
#' consider the series (0,20, 99, 150, 102, 100); then `time_near_latest` is
#' consider the series (0, 20, 99, 150, 102, 100); then `time_near_latest` is
#' the 5th index, since even though 99 is within 20%, it is outside the window
#' afterwards at 150.
#' @param epi_arch an epi_archive to be analyzed
Expand Down Expand Up @@ -64,7 +64,7 @@
#' revision_example %>% arrange(desc(spread))
#' @export
#' @importFrom cli cli_inform cli_abort cli_li
#' @importFrom rlang list2 syms
#' @importFrom rlang list2 syms dots_n
#' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across
#' everything ungroup summarize if_else %>%
revision_summary <- function(epi_arch,
Expand All @@ -80,12 +80,19 @@ revision_summary <- function(epi_arch,
compactify_tol = .Machine$double.eps^0.5,
should_compactify = TRUE) {
assert_class(epi_arch, "epi_archive")
arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT))
if (length(arg) == 0) {
# Choose the first column that's not a key or version
if (dots_n(...) == 0) {
# Choose the first column that's not a key:
arg <- setdiff(names(epi_arch$DT), key_colnames(epi_arch))[[1]]
} else if (length(arg) > 1) {
cli_abort("Not currently implementing more than one column at a time. Run each separately")
} else {
arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT))
if (length(arg) == 0) {
cli_abort("Could not find any columns matching the selection in `...`.",
class = "epiprocess__revision_summary__selected_zero_columns"
)
}
if (length(arg) > 1) {
cli_abort("Not currently implementing more than one column at a time. Run each separately.")
}
}
if (is.null(abs_spread_threshold)) {
abs_spread_threshold <- .05 * epi_arch$DT %>%
Expand All @@ -99,10 +106,12 @@ revision_summary <- function(epi_arch,
# the max lag
#
# revision_tibble
keys <- key_colnames(epi_arch, exclude = "version")
epikey_names <- key_colnames(epi_arch, exclude = c("time_value", "version"))
epikeytime_names <- c(epikey_names, "time_value")
keys <- c(epikeytime_names, "version")

revision_behavior <- epi_arch$DT %>%
select(all_of(unique(c("geo_value", "time_value", keys, "version", arg))))
select(all_of(unique(c(keys, arg))))
if (!is.null(min_waiting_period)) {
revision_behavior <- revision_behavior %>%
filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period)
Expand All @@ -112,27 +121,26 @@ revision_summary <- function(epi_arch,
# if we're dropping NA's, we should recompactify
revision_behavior <-
revision_behavior %>%
filter(!is.na(c_across(!!arg)))
filter(!is.na(.data[[arg]]))
} else {
revision_behavior <- epi_arch$DT
}
if (should_compactify) {
revision_behavior <- revision_behavior %>%
arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying
apply_compactify(c(keys, version), compactify_tol)
apply_compactify(keys, compactify_tol)
}
revision_behavior <-
revision_behavior %>%
mutate(lag = as.integer(version) - as.integer(time_value)) %>% # nolint: object_usage_linter
group_by(across(all_of(keys))) %>% # group by all the keys
group_by(across(all_of(epikeytime_names))) %>% # group = versions of one measurement
summarize(
n_revisions = dplyr::n() - 1,
min_lag = min(lag), # nolint: object_usage_linter
max_lag = max(lag), # nolint: object_usage_linter
min_value = f_no_na(min, pick(!!arg)),
max_value = f_no_na(max, pick(!!arg)),
median_value = f_no_na(median, pick(!!arg)),
time_to = time_within_x_latest(lag, pick(!!arg), prop = within_latest), # nolint: object_usage_linter
min_value = f_no_na(min, .data[[arg]]),
max_value = f_no_na(max, .data[[arg]]),
median_value = f_no_na(median, .data[[arg]]),
time_to = time_within_x_latest(lag, .data[[arg]], prop = within_latest),
.groups = "drop"
) %>%
mutate(
Expand All @@ -145,7 +153,7 @@ revision_summary <- function(epi_arch,
) %>%
select(-time_to) %>%
relocate(
time_value, geo_value, all_of(keys), n_revisions, min_lag, max_lag, # nolint: object_usage_linter
time_value, geo_value, all_of(epikey_names), n_revisions, min_lag, max_lag, # nolint: object_usage_linter
time_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter
)
if (print_inform) {
Expand Down Expand Up @@ -203,10 +211,9 @@ revision_summary <- function(epi_arch,
}

#' pull the value from lags when values starts indefinitely being within prop of it's last value.
#' @param values this should be a 1 column tibble. errors may occur otherwise
#' @param values this should be a vector (e.g., a column). errors may occur otherwise
#' @keywords internal
time_within_x_latest <- function(lags, values, prop = .2) {
values <- values[[1]]
latest_value <- values[[length(values)]]
close_enough <- abs(values - latest_value) < prop * latest_value
# we want to ignore any stretches where it's close, but goes farther away later
Expand All @@ -222,7 +229,6 @@ time_within_x_latest <- function(lags, values, prop = .2) {
#' @keywords internal
get_last_run <- function(bool_vec, values_from) {
runs <- rle(bool_vec)
length(bool_vec) - tail(runs$lengths, n = 1)
values_from[[length(bool_vec) - tail(runs$lengths, n = 1) + 1]]
}

Expand Down
2 changes: 1 addition & 1 deletion man/revision_summary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/time_within_x_latest.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 23 additions & 16 deletions tests/testthat/test-revision-latency-functions.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,5 @@
dummy_ex <- tibble::tribble(
~geo_value, ~time_value, ~version, ~value,
# al 1 has 1 real revision, a lag of 0, and changes by 99
"al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1,
"al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1,
"al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100,
# al 2 has no revision, a min lag of 0, and a rel_spread of 0
"al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1,
# al 3 has 1 revision and a min lag of 1, and a change of 3
"al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1,
"al", as.Date("2020-01-03"), as.Date("2020-01-05"), 4,
# al 4 has 1 revision including NA's none if not, a lag of 0/1 and changes of 0
"al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA,
"al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9,
# ak 1 has 4 revisions w/out NAs, but 6 with NAs
# a min lag of 2, and a change of 101
"ak", as.Date("2020-01-01"), as.Date("2020-01-03"), 1,
Expand All @@ -27,6 +15,18 @@ dummy_ex <- tibble::tribble(
# ak 3 has 0 revisions, and a value of zero, and thus a rel_spread of NaN
"ak", as.Date("2020-01-03"), as.Date("2020-01-06"), 0,
"ak", as.Date("2020-01-03"), as.Date("2020-01-07"), 0,
# al 1 has 1 real revision, a lag of 0, and changes by 99
"al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1,
"al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1,
"al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100,
# al 2 has no revision, a min lag of 0, and a rel_spread of 0
"al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1,
# al 3 has 1 revision and a min lag of 1, and a change of 3
"al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1,
"al", as.Date("2020-01-03"), as.Date("2020-01-05"), 4,
# al 4 has 1 revision including NA's none if not, a lag of 0/1 and changes of 0
"al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA,
"al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9,
) %>%
as_epi_archive(versions_end = as.Date("2022-01-01"), compactify = FALSE)

Expand All @@ -40,12 +40,19 @@ test_that("tidyselect is functional", {
with_later_key_col <- dummy_ex$DT %>%
select(geo_value, time_value, value, version) %>%
as_epi_archive(versions_end = dummy_ex$versions_end, compactify = FALSE)
expect_equal(quiet(revision_summary(with_later_key_col)),
quiet(revision_summary(dummy_ex)))
expect_equal(
quiet(revision_summary(with_later_key_col)),
quiet(revision_summary(dummy_ex))
)
with_later_val_col <- dummy_ex$DT %>%
mutate(value2 = 0) %>%
as_epi_archive(versions_end = dummy_ex$versions_end, compactify = FALSE)
expect_equal(quiet(revision_summary(with_later_val_col, value)),
quiet(revision_summary(dummy_ex, value)))
expect_equal(
quiet(revision_summary(with_later_val_col, value)),
quiet(revision_summary(dummy_ex, value))
)
expect_error(revision_summary(with_later_val_col, !everything()),
class = "epiprocess__revision_summary__selected_zero_columns"
)
})
test_that("revision_summary works for various timetypes", {})

0 comments on commit f9b85c4

Please sign in to comment.