diff --git a/DESCRIPTION b/DESCRIPTION index 456e7a5e..c2b1b98e 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.9.4 +Version: 0.9.6 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", , "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NAMESPACE b/NAMESPACE index 904b2d24..ffce4c50 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -38,9 +38,9 @@ S3method(guess_period,Date) S3method(guess_period,POSIXt) S3method(guess_period,default) S3method(key_colnames,data.frame) -S3method(key_colnames,default) S3method(key_colnames,epi_archive) S3method(key_colnames,epi_df) +S3method(key_colnames,tbl_ts) S3method(mean,epi_df) S3method(next_after,Date) S3method(next_after,integer) @@ -99,12 +99,14 @@ export(time_column_names) export(ungroup) export(unnest) export(validate_epi_archive) +export(vec_position_lag) export(version_column_names) importFrom(checkmate,anyInfinite) importFrom(checkmate,anyMissing) importFrom(checkmate,assert) importFrom(checkmate,assert_character) importFrom(checkmate,assert_class) +importFrom(checkmate,assert_count) importFrom(checkmate,assert_data_frame) importFrom(checkmate,assert_function) importFrom(checkmate,assert_int) @@ -191,6 +193,8 @@ importFrom(rlang,as_label) importFrom(rlang,caller_arg) importFrom(rlang,caller_env) importFrom(rlang,check_dots_empty) +importFrom(rlang,check_dots_empty0) +importFrom(rlang,dots_n) importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,env) @@ -230,3 +234,8 @@ importFrom(tidyselect,starts_with) importFrom(tsibble,as_tsibble) importFrom(utils,capture.output) importFrom(utils,tail) +importFrom(vctrs,obj_check_vector) +importFrom(vctrs,vec_c) +importFrom(vctrs,vec_cast) +importFrom(vctrs,vec_size) +importFrom(vctrs,vec_slice) diff --git a/R/archive.R b/R/archive.R index d8102165..9e921b7c 100644 --- a/R/archive.R +++ b/R/archive.R @@ -385,6 +385,23 @@ removed_by_compactify <- function(df, keys, tolerance) { )) # nolint: object_usage_linter } +#' Lag entries in a vctrs-style vector by their position in the vector +#' +#' @importFrom checkmate assert_count +#' @importFrom vctrs obj_check_vector vec_c vec_slice vec_size +#' @keywords internal +#' @export +vec_position_lag <- function(x, n) { + obj_check_vector(x) + assert_count(n) + n <- min(n, vec_size(x)) + if (n == 0L) { + x + } else { + vec_c(rep(NA, n), vec_slice(x, seq_len(vec_size(x) - n))) + } +} + +#' Checks to see if a value in a vector is LOCF #' @description #' LOCF meaning last observation carried forward. 
lags the vector by 1, then @@ -394,8 +411,8 @@ removed_by_compactify <- function(df, keys, tolerance) { #' @importFrom dplyr lag if_else near #' @keywords internal is_locf <- function(vec, tolerance) { # nolint: object_usage_linter - lag_vec <- dplyr::lag(vec) - if (typeof(vec) == "double") { + lag_vec <- vec_position_lag(vec, 1L) + if (inherits(vec, "numeric")) { # (no matrix/array/general support) res <- if_else( !is.na(vec) & !is.na(lag_vec), near(vec, lag_vec, tol = tolerance), diff --git a/R/key_colnames.R b/R/key_colnames.R index 49c32674..3d700975 100644 --- a/R/key_colnames.R +++ b/R/key_colnames.R @@ -1,47 +1,134 @@ -#' Grab any keys associated to an epi_df +#' Get names of columns that form a (unique) key associated with an object #' -#' @param x a data.frame, tibble, or epi_df +#' This is entirely based on metadata and arguments passed; there are no +#' explicit checks that the key actually is unique in any associated data +#' structures. +#' +#' @param x an object, such as an [`epi_df`] #' @param ... additional arguments passed on to methods -#' @param other_keys an optional character vector of other keys to include -#' @param exclude an optional character vector of keys to exclude -#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL`. +#' @param geo_keys optional character vector; which columns (if any) to consider +#' keys specifying the geographical region? Defaults to `"geo_value"` if +#' present; must be `"geo_value"` if `x` is an `epi_df`. +#' @param other_keys character vector; which columns (if any) to consider keys +#' specifying demographical or identifying/grouping information besides the +#' geographical region and time interval? Mandatory if `x` is a vanilla +#' `data.frame` or `tibble`. Optional if `x` is an `epi_df`; default is the +#' `epi_df`'s `other_keys`; if you provide `other_keys`, they must match the +#' default. (This behavior is to enable consistent and sane results when you +#' can't guarantee whether `x` is an `epi_df` or just a +#' `tibble`/`data.frame`.) +#' @param time_keys optional character vector; which columns (if any) to +#' consider keys specifying the time interval during which associated events +#' occurred? Defaults to `"time_value"` if present; must be `"time_value"` if +#' `x` is an `epi_df`. +#' @param exclude an optional character vector of key column names to exclude +#' from the result +#' @return character vector #' @keywords internal #' @export -key_colnames <- function(x, ...) { - UseMethod("key_colnames") -} - -#' @rdname key_colnames -#' @method key_colnames default -#' @export -key_colnames.default <- function(x, ...) { - character(0L) +key_colnames <- function(x, ..., exclude = character()) { + provided_args <- rlang::call_args_names(rlang::call_match()) + if ("extra_keys" %in% provided_args) { + lifecycle::deprecate_soft("0.9.6", "key_colnames(extra_keys=)", "key_colnames(other_keys=)") + redispatch <- function(..., extra_keys) { + key_colnames(..., other_keys = extra_keys) + } + redispatch(x, ..., exclude = exclude) + } else { + UseMethod("key_colnames") + } } #' @rdname key_colnames +#' @importFrom rlang check_dots_empty0 #' @method key_colnames data.frame #' @export -key_colnames.data.frame <- function(x, other_keys = character(0L), exclude = character(0L), ...) { +key_colnames.data.frame <- function(x, ..., + geo_keys = intersect("geo_value", names(x)), + other_keys, + time_keys = intersect("time_value", names(x)), + exclude = character()) { + check_dots_empty0(...) 
+ assert_character(geo_keys) + assert_character(time_keys) assert_character(other_keys) assert_character(exclude) - nm <- setdiff(c("geo_value", other_keys, "time_value"), exclude) - intersect(nm, colnames(x)) + keys <- c(geo_keys, other_keys, time_keys) + if (!all(keys %in% names(x))) { + cli_abort(c( + "Some of the specified key columns aren't present in `x`", + "i" = "Specified keys: {format_varnames(keys)}", + "i" = "Columns of x: {format_varnames(names(x))}", + "x" = "Missing keys: {format_varnames(setdiff(keys, names(x)))}" + ), class = "epiprocess__key_colnames__keys_not_in_colnames") + } + setdiff(keys, exclude) } #' @rdname key_colnames #' @method key_colnames epi_df #' @export -key_colnames.epi_df <- function(x, exclude = character(0L), ...) { +key_colnames.epi_df <- function(x, ..., + geo_keys = "geo_value", + other_keys = NULL, + time_keys = "time_value", + exclude = character()) { + check_dots_empty0(...) + if (!identical(geo_keys, "geo_value")) { + cli_abort('If `x` is an `epi_df`, then `geo_keys` must be `"geo_value"`', + class = "epiprocess__key_colnames__mismatched_geo_keys" + ) + } + if (!identical(time_keys, "time_value")) { + cli_abort('If `x` is an `epi_df`, then `time_keys` must be `"time_value"`', + class = "epiprocess__key_colnames__mismatched_time_keys" + ) + } + expected_other_keys <- attr(x, "metadata")$other_keys + if (is.null(other_keys)) { + other_keys <- expected_other_keys + } else { + if (!identical(other_keys, expected_other_keys)) { + cli_abort(c( + "The provided `other_keys` argument didn't match the `other_keys` of `x`", + "*" = "`other_keys` was {format_chr_with_quotes(other_keys)}", + "*" = "`expected_other_keys` was {format_chr_with_quotes(expected_other_keys)}", + "i" = "If you know that `x` will always be an `epi_df` and + resolve this discrepancy by adjusting the metadata of `x`, you + shouldn't have to pass `other_keys =` here anymore, + unless you want to continue to perform this check." + ), class = "epiprocess__key_colnames__mismatched_other_keys") + } + } assert_character(exclude) - other_keys <- attr(x, "metadata")$other_keys setdiff(c("geo_value", other_keys, "time_value"), exclude) } +#' @rdname key_colnames +#' @method key_colnames tbl_ts +#' @export +key_colnames.tbl_ts <- function(x, ..., exclude = character()) { + check_dots_empty0(...) + assert_character(exclude) + idx <- tsibble::index_var(x) + idx2 <- tsibble::index2_var(x) + if (!identical(idx, idx2)) { + cli_abort(c( + "`x` is in the middle of a re-indexing operation with `index_by()`; it's unclear + whether we should output the old unique key or the new unique key-to-be", + "i" = "Old index: {format_varname(idx)}", + "i" = "Pending new index: {format_varname(idx2)}", + "Please complete (e.g., with `summarise()`) or remove the re-indexing operation." + ), class = "epiprocess__key_colnames__incomplete_reindexing_operation") + } + setdiff(c(tsibble::key_vars(x), idx), exclude) +} + #' @rdname key_colnames #' @method key_colnames epi_archive #' @export -key_colnames.epi_archive <- function(x, exclude = character(0L), ...) { +key_colnames.epi_archive <- function(x, ..., exclude = character()) { + check_dots_empty0(...) 
assert_character(exclude) - other_keys <- attr(x, "metadata")$other_keys - setdiff(c("geo_value", other_keys, "time_value"), exclude) + setdiff(c("geo_value", x$other_keys, "time_value", "version"), exclude) } diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 7be0cd24..da3db933 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -17,11 +17,11 @@ #' 8. `rel_spread`: `spread` divided by the largest value (so it will #' always be less than 1). Note that this need not be the final value. It will #' be `NA` whenever `spread` is 0. -#' 9. `time_near_latest`: This gives the lag when the value is within -#' `within_latest` (default 20%) of the value at the latest time. For example, -#' consider the series (0,20, 99, 150, 102, 100); then `time_near_latest` is -#' the 5th index, since even though 99 is within 20%, it is outside the window -#' afterwards at 150. +#' 9. `lag_near_latest`: This gives the lag when the value comes within, and +#' remains within, `within_latest` (default 20%) of the value at the latest time. For +#' example, consider the series (0, 20, 99, 150, 102, 100); then +#' `lag_near_latest` is the 5th index, since even though 99 is within 20%, it +#' is outside the window afterwards at 150. #' @param epi_arch an epi_archive to be analyzed #' @param ... <[`tidyselect`][dplyr_tidy_select]>, used to choose the column to #' summarize. If empty, it chooses the first. Currently only implemented for @@ -39,7 +39,7 @@ #' final value for case counts as reported in the context of insurance. To #' avoid this filtering, either set to `NULL` or 0. #' @param within_latest double between 0 and 1. Determines the threshold -#' used for the `time_to` +#' used for the `lag_near_latest` column #' @param quick_revision difftime or integer (integer is treated as days), for #' the printed summary, the amount of time between the final revision and the #' actual time_value to consider the revision quickly resolved. Default of 3 @@ -64,7 +64,8 @@ #' revision_example %>% arrange(desc(spread)) #' @export #' @importFrom cli cli_inform cli_abort cli_li -#' @importFrom rlang list2 syms +#' @importFrom rlang list2 syms dots_n +#' @importFrom vctrs vec_cast #' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across #' everything ungroup summarize if_else %>% revision_summary <- function(epi_arch, @@ -79,16 +80,24 @@ revision_summary <- function(epi_arch, rel_spread_threshold = 0.1, compactify_tol = .Machine$double.eps^0.5, should_compactify = TRUE) { - arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT)) - if (length(arg) == 0) { - # Choose the first column that's not a key or version - arg <- setdiff(names(epi_arch$DT), c(key_colnames(epi_arch), "version"))[[1]] - } else if (length(arg) > 1) { - cli_abort("Not currently implementing more than one column at a time. Run each separately") + assert_class(epi_arch, "epi_archive") + if (dots_n(...) == 0) { + # Choose the first column that's not a key: + arg <- setdiff(names(epi_arch$DT), key_colnames(epi_arch))[[1]] + } else { + arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT)) + if (length(arg) == 0) { + cli_abort("Could not find any columns matching the selection in `...`.", + class = "epiprocess__revision_summary__selected_zero_columns" + ) + } + if (length(arg) > 1) { + cli_abort("Not currently implementing more than one column at a time. 
Run each separately.") + } } if (is.null(abs_spread_threshold)) { abs_spread_threshold <- .05 * epi_arch$DT %>% - pull(...) %>% + pull(!!arg) %>% max(na.rm = TRUE) } # for each time_value, get @@ -98,40 +107,41 @@ revision_summary <- function(epi_arch, # the max lag # # revision_tibble - keys <- key_colnames(epi_arch) + epikey_names <- key_colnames(epi_arch, exclude = c("time_value", "version")) + epikeytime_names <- c(epikey_names, "time_value") + keys <- c(epikeytime_names, "version") revision_behavior <- epi_arch$DT %>% - select(all_of(unique(c("geo_value", "time_value", keys, "version", arg)))) + select(all_of(unique(c(keys, arg)))) if (!is.null(min_waiting_period)) { revision_behavior <- revision_behavior %>% - filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period) + filter(vec_cast(epi_arch$versions_end - time_value, min_waiting_period) >= min_waiting_period) } if (drop_nas) { # if we're dropping NA's, we should recompactify revision_behavior <- revision_behavior %>% - filter(!is.na(c_across(!!arg))) + filter(!is.na(.data[[arg]])) } else { revision_behavior <- epi_arch$DT } if (should_compactify) { revision_behavior <- revision_behavior %>% - arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying - apply_compactify(c(keys, version), compactify_tol) + apply_compactify(keys, compactify_tol) } revision_behavior <- revision_behavior %>% mutate(lag = as.integer(version) - as.integer(time_value)) %>% # nolint: object_usage_linter - group_by(across(all_of(keys))) %>% # group by all the keys + group_by(across(all_of(epikeytime_names))) %>% # group = versions of one measurement summarize( n_revisions = dplyr::n() - 1, min_lag = min(lag), # nolint: object_usage_linter max_lag = max(lag), # nolint: object_usage_linter - min_value = f_no_na(min, pick(!!arg)), - max_value = f_no_na(max, pick(!!arg)), - median_value = f_no_na(median, pick(!!arg)), - time_to = time_within_x_latest(lag, pick(!!arg), prop = within_latest), # nolint: object_usage_linter + min_value = f_no_na(min, .data[[arg]]), + max_value = f_no_na(max, .data[[arg]]), + median_value = f_no_na(median, .data[[arg]]), + lag_to = lag_within_x_latest(lag, .data[[arg]], prop = within_latest), .groups = "drop" ) %>% mutate( @@ -140,12 +150,12 @@ revision_summary <- function(epi_arch, # TODO the units here may be a problem min_lag = as.difftime(min_lag, units = "days"), # nolint: object_usage_linter max_lag = as.difftime(max_lag, units = "days"), # nolint: object_usage_linter - time_near_latest = as.difftime(time_to, units = "days") # nolint: object_usage_linter + lag_near_latest = as.difftime(lag_to, units = "days") # nolint: object_usage_linter ) %>% - select(-time_to) %>% + select(-lag_to) %>% relocate( - time_value, geo_value, all_of(keys), n_revisions, min_lag, max_lag, # nolint: object_usage_linter - time_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter + time_value, geo_value, all_of(epikey_names), n_revisions, min_lag, max_lag, # nolint: object_usage_linter + lag_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter ) if (print_inform) { cli_inform("Min lag (time to first version):") @@ -196,16 +206,17 @@ revision_summary <- function(epi_arch, cli_li(num_percent(abs_spread, n_real_revised, "")) cli_inform("{units(quick_revision)} until within {within_latest*100}% of the latest value:") - difftime_summary(revision_behavior[["time_near_latest"]]) %>% print() + 
difftime_summary(revision_behavior[["lag_near_latest"]]) %>% print() } return(revision_behavior) } -#' pull the value from lags when values starts indefinitely being within prop of it's last value. -#' @param values this should be a 1 column tibble. errors may occur otherwise +#' pull the value from lags at the point where values starts staying within prop of its latest value. +#' @param lags vector of lags; should be sorted +#' @param values this should be a vector (e.g., a column) with length matching that of `lags` +#' @param prop optional length-1 double; proportion of the latest value used as the closeness threshold #' @keywords internal -time_within_x_latest <- function(lags, values, prop = .2) { - values <- values[[1]] +lag_within_x_latest <- function(lags, values, prop = .2) { latest_value <- values[[length(values)]] close_enough <- abs(values - latest_value) < prop * latest_value # we want to ignore any stretches where it's close, but goes farther away later @@ -221,11 +232,10 @@ #' @keywords internal get_last_run <- function(bool_vec, values_from) { runs <- rle(bool_vec) - length(bool_vec) - tail(runs$lengths, n = 1) values_from[[length(bool_vec) - tail(runs$lengths, n = 1) + 1]] } -#' use when the default behavior returns a warning on empty lists, which we do +#' use when the default behavior returns a warning on empty vectors, which we do not want, and there is no super clean way of preventing this #' @keywords internal f_no_na <- function(f, x) { diff --git a/man/f_no_na.Rd b/man/f_no_na.Rd index 9a832d72..1e3acb6f 100644 --- a/man/f_no_na.Rd +++ b/man/f_no_na.Rd @@ -2,13 +2,13 @@ % Please edit documentation in R/revision_analysis.R \name{f_no_na} \alias{f_no_na} -\title{use when the default behavior returns a warning on empty lists, which we do +\title{use when the default behavior returns a warning on empty vectors, which we do not want, and there is no super clean way of preventing this} \usage{ f_no_na(f, x) } \description{ -use when the default behavior returns a warning on empty lists, which we do +use when the default behavior returns a warning on empty vectors, which we do not want, and there is no super clean way of preventing this } \keyword{internal} diff --git a/man/key_colnames.Rd b/man/key_colnames.Rd index f5e13837..925601a6 100644 --- a/man/key_colnames.Rd +++ b/man/key_colnames.Rd @@ -2,35 +2,68 @@ % Please edit documentation in R/key_colnames.R \name{key_colnames} \alias{key_colnames} -\alias{key_colnames.default} \alias{key_colnames.data.frame} \alias{key_colnames.epi_df} +\alias{key_colnames.tbl_ts} \alias{key_colnames.epi_archive} -\title{Grab any keys associated to an epi_df} +\title{Get names of columns that form a (unique) key associated with an object} \usage{ -key_colnames(x, ...) +key_colnames(x, ..., exclude = character()) -\method{key_colnames}{default}(x, ...) +\method{key_colnames}{data.frame}( + x, + ..., + geo_keys = intersect("geo_value", names(x)), + other_keys, + time_keys = intersect("time_value", names(x)), + exclude = character() ) -\method{key_colnames}{data.frame}(x, other_keys = character(0L), exclude = character(0L), ...) +\method{key_colnames}{epi_df}( + x, + ..., + geo_keys = "geo_value", + other_keys = NULL, + time_keys = "time_value", + exclude = character() ) -\method{key_colnames}{epi_df}(x, exclude = character(0L), ...) +\method{key_colnames}{tbl_ts}(x, ..., exclude = character()) -\method{key_colnames}{epi_archive}(x, exclude = character(0L), ...) 
+\method{key_colnames}{epi_archive}(x, ..., exclude = character()) } \arguments{ -\item{x}{a data.frame, tibble, or epi_df} +\item{x}{an object, such as an \code{\link{epi_df}}} \item{...}{additional arguments passed on to methods} -\item{other_keys}{an optional character vector of other keys to include} +\item{exclude}{an optional character vector of key column names to exclude +from the result} -\item{exclude}{an optional character vector of keys to exclude} +\item{geo_keys}{optional character vector; which columns (if any) to consider +keys specifying the geographical region? Defaults to \code{"geo_value"} if +present; must be \code{"geo_value"} if \code{x} is an \code{epi_df}.} + +\item{other_keys}{character vector; which columns (if any) to consider keys +specifying demographical or identifying/grouping information besides the +geographical region and time interval? Mandatory if \code{x} is a vanilla +\code{data.frame} or \code{tibble}. Optional if \code{x} is an \code{epi_df}; default is the +\code{epi_df}'s \code{other_keys}; if you provide \code{other_keys}, they must match the +default. (This behavior is to enable consistent and sane results when you +can't guarantee whether \code{x} is an \code{epi_df} or just a +\code{tibble}/\code{data.frame}.)} + +\item{time_keys}{optional character vector; which columns (if any) to +consider keys specifying the time interval during which associated events +occurred? Defaults to \code{"time_value"} if present; must be \code{"time_value"} if +\code{x} is an \code{epi_df}.} } \value{ -If an \code{epi_df}, this returns all "keys". Otherwise \code{NULL}. +character vector } \description{ -Grab any keys associated to an epi_df +This is entirely based on metadata and arguments passed; there are no +explicit checks that the key actually is unique in any associated data +structures. } \keyword{internal} diff --git a/man/lag_within_x_latest.Rd b/man/lag_within_x_latest.Rd new file mode 100644 index 00000000..9c90fd8c --- /dev/null +++ b/man/lag_within_x_latest.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/revision_analysis.R +\name{lag_within_x_latest} +\alias{lag_within_x_latest} +\title{pull the value from lags at the point where values starts staying within prop of its latest value.} +\usage{ +lag_within_x_latest(lags, values, prop = 0.2) +} +\arguments{ +\item{lags}{vector of lags; should be sorted} + +\item{values}{this should be a vector (e.g., a column) with length matching that of \code{lags}} + +\item{prop}{optional length-1 double; proportion of the latest value used as the closeness threshold} +} +\description{ +pull the value from lags at the point where values starts staying within prop of its latest value. +} +\keyword{internal} diff --git a/man/revision_summary.Rd b/man/revision_summary.Rd index 590a1ed5..20c0cd02 100644 --- a/man/revision_summary.Rd +++ b/man/revision_summary.Rd @@ -42,7 +42,7 @@ final value for case counts as reported in the context of insurance. To avoid this filtering, either set to \code{NULL} or 0.} \item{within_latest}{double between 0 and 1. Determines the threshold -used for the \code{time_to}} +used for the \code{lag_near_latest} column} \item{quick_revision}{difftime or integer (integer is treated as days), for the printed summary, the amount of time between the final revision and the actual time_value to consider the revision quickly resolved. Default of 3 @@ -86,11 +86,11 @@ always excludes \code{NA} values) \item \code{rel_spread}: \code{spread} divided by the largest value (so it will always be less than 1). Note that this need not be the final value. 
It will be \code{NA} whenever \code{spread} is 0. -\item \code{time_near_latest}: This gives the lag when the value is within -\code{within_latest} (default 20\%) of the value at the latest time. For example, -consider the series (0,20, 99, 150, 102, 100); then \code{time_near_latest} is -the 5th index, since even though 99 is within 20\%, it is outside the window -afterwards at 150. +\item \code{lag_near_latest}: This gives the lag when the value comes within, and +remains within, \code{within_latest} (default 20\%) of the value at the latest time. For +example, consider the series (0, 20, 99, 150, 102, 100); then +\code{lag_near_latest} is the 5th index, since even though 99 is within 20\%, it +is outside the window afterwards at 150. } } \examples{ diff --git a/man/time_within_x_latest.Rd b/man/time_within_x_latest.Rd deleted file mode 100644 index 1dd7e801..00000000 --- a/man/time_within_x_latest.Rd +++ /dev/null @@ -1,15 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/revision_analysis.R -\name{time_within_x_latest} -\alias{time_within_x_latest} -\title{pull the value from lags when values starts indefinitely being within prop of it's last value.} -\usage{ -time_within_x_latest(lags, values, prop = 0.2) -} -\arguments{ -\item{values}{this should be a 1 column tibble. errors may occur otherwise} -} -\description{ -pull the value from lags when values starts indefinitely being within prop of it's last value. -} -\keyword{internal} diff --git a/man/vec_position_lag.Rd b/man/vec_position_lag.Rd new file mode 100644 index 00000000..3e828455 --- /dev/null +++ b/man/vec_position_lag.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive.R +\name{vec_position_lag} +\alias{vec_position_lag} +\title{Lag entries in a vctrs-style vector by their position in the vector} +\usage{ +vec_position_lag(x, n) +} +\description{ +Lag entries in a vctrs-style vector by their position in the vector +} +\keyword{internal} diff --git a/tests/testthat/_snaps/revision-latency-functions.md b/tests/testthat/_snaps/revision-latency-functions.md index 1ac21469..27cc8595 100644 --- a/tests/testthat/_snaps/revision-latency-functions.md +++ b/tests/testthat/_snaps/revision-latency-functions.md @@ -25,15 +25,15 @@ min median mean max 0 days 3 days 6.9 days 19 days # A tibble: 7 x 11 - time_value geo_value n_revisions min_lag max_lag time_near_latest spread - - 1 2020-01-01 ak 4 2 days 19 days 19 days 101 - 2 2020-01-02 ak 1 4 days 5 days 4 days 9 - 3 2020-01-03 ak 0 3 days 3 days 3 days 0 - 4 2020-01-01 al 1 0 days 19 days 19 days 99 - 5 2020-01-02 al 0 0 days 0 days 0 days 0 - 6 2020-01-03 al 1 1 days 2 days 2 days 3 - 7 2020-01-04 al 0 1 days 1 days 1 days 0 + time_value geo_value n_revisions min_lag max_lag lag_near_latest spread + + 1 2020-01-01 ak 4 2 days 19 days 19 days 101 + 2 2020-01-02 ak 1 4 days 5 days 4 days 9 + 3 2020-01-03 ak 0 3 days 3 days 3 days 0 + 4 2020-01-01 al 1 0 days 19 days 19 days 99 + 5 2020-01-02 al 0 0 days 0 days 0 days 0 + 6 2020-01-03 al 1 1 days 2 days 2 days 3 + 7 2020-01-04 al 0 1 days 1 days 1 days 0 rel_spread min_value max_value median_value 1 0.990 1 102 6 @@ -73,15 +73,15 @@ min median mean max 0 days 3 days 6.9 days 19 days # A tibble: 7 x 11 - time_value geo_value n_revisions min_lag max_lag time_near_latest spread - - 1 2020-01-01 ak 6 2 days 19 days 19 days 101 - 2 2020-01-02 ak 1 4 days 5 days 4 days 9 - 3 2020-01-03 ak 0 3 days 3 days 3 days 0 - 4 2020-01-01 al 1 0 days 19 days 19 days 99 - 5 
2020-01-02 al 0 0 days 0 days 0 days 0 - 6 2020-01-03 al 1 1 days 2 days 2 days 3 - 7 2020-01-04 al 1 0 days 1 days 1 days 0 + time_value geo_value n_revisions min_lag max_lag lag_near_latest spread + + 1 2020-01-01 ak 6 2 days 19 days 19 days 101 + 2 2020-01-02 ak 1 4 days 5 days 4 days 9 + 3 2020-01-03 ak 0 3 days 3 days 3 days 0 + 4 2020-01-01 al 1 0 days 19 days 19 days 99 + 5 2020-01-02 al 0 0 days 0 days 0 days 0 + 6 2020-01-03 al 1 1 days 2 days 2 days 3 + 7 2020-01-04 al 1 0 days 1 days 1 days 0 rel_spread min_value max_value median_value 1 0.990 1 102 5.5 diff --git a/tests/testthat/test-key_colnames.R b/tests/testthat/test-key_colnames.R new file mode 100644 index 00000000..86ae1c99 --- /dev/null +++ b/tests/testthat/test-key_colnames.R @@ -0,0 +1,170 @@ +test_that("`key_colnames` on non-`epi_df`-like tibbles works as expected", { + withr::local_options(list(lifecycle_verbosity = "warning")) # for extra_keys tests + + k1k2_tbl <- tibble::tibble(k1 = 1, k2 = 1) + + expect_equal( + key_colnames(k1k2_tbl, geo_keys = character(0L), time_keys = character(0L), other_keys = c("k1", "k2")), + c("k1", "k2") + ) + # `geo_keys` and `time_keys` are optional, and, in this case, inferred to be absent: + expect_equal( + key_colnames(k1k2_tbl, other_keys = c("k1", "k2")), + c("k1", "k2") + ) + # but `other_keys` is mandatory: + expect_error( + key_colnames(k1k2_tbl) + ) + + # Manually specifying keys that aren't there is an error: + expect_error( + key_colnames(k1k2_tbl, geo_keys = "bogus", other_keys = c("k1", "k2")), + class = "epiprocess__key_colnames__keys_not_in_colnames" + ) + expect_error( + key_colnames(k1k2_tbl, time_keys = "bogus", other_keys = c("k1", "k2")), + class = "epiprocess__key_colnames__keys_not_in_colnames" + ) + expect_error( + key_colnames(k1k2_tbl, other_keys = "bogus"), + class = "epiprocess__key_colnames__keys_not_in_colnames" + ) + + # We can specify non-`epi_df`-like geo keys: + expect_equal( + key_colnames(k1k2_tbl, geo_keys = c("k1", "k2"), other_keys = character(0L)), + c("k1", "k2") + ) +}) + +test_that("`key_colnames` on `epi_df`s and similar tibbles works as expected", { + gat_tbl <- tibble::tibble(geo_value = 1, age_group = 1, time_value = 1) + gat_edf <- as_epi_df(gat_tbl, other_keys = "age_group", as_of = 2) + + # For tbl: `geo_keys` and `time_keys` are optional, and, in this case, + # inferred to be (just) `geo_value` and (just) `time_value`: + expect_equal( + key_colnames(gat_tbl, other_keys = "age_group"), + c("geo_value", "age_group", "time_value") + ) + # and edfs give something compatible: + expect_equal( + key_colnames(gat_edf, other_keys = "age_group"), + c("geo_value", "age_group", "time_value") + ) + # though edfs don't have to specify the `other_keys`: + expect_equal( + key_colnames(gat_edf), + c("geo_value", "age_group", "time_value") + ) + # and they will balk if we write something intended to work for both tbls and + # edfs but mis-specify the `other_keys`: + expect_error( + key_colnames(gat_edf, other_keys = character(0L)), + class = "epiprocess__key_colnames__mismatched_other_keys" + ) + + # edfs also won't let us specify nonstandard geotime keys: + expect_error( + key_colnames(gat_edf, geo_keys = "time_value"), + class = "epiprocess__key_colnames__mismatched_geo_keys" + ) + expect_error( + key_colnames(gat_edf, time_keys = "geo_value"), + class = "epiprocess__key_colnames__mismatched_time_keys" + ) + + # We can exclude keys: + expect_equal( + key_colnames(gat_tbl, other_keys = "age_group", exclude = c("time_value")), + c("geo_value", 
"age_group") + ) + expect_equal( + key_colnames(gat_tbl, other_keys = "age_group", exclude = c("geo_value", "time_value")), + c("age_group") + ) + expect_equal( + key_colnames(gat_edf, exclude = c("time_value")), + c("geo_value", "age_group") + ) + expect_equal( + key_colnames(gat_edf, exclude = c("geo_value", "time_value")), + c("age_group") + ) + + # Using `extra_keys =` is soft-deprecated and routes to `other_keys =`: + expect_warning( + gat_tbl_extra_keys_res <- key_colnames(gat_tbl, extra_keys = "age_group"), + class = "lifecycle_warning_deprecated" + ) + expect_equal(gat_tbl_extra_keys_res, c("geo_value", "age_group", "time_value")) + + expect_warning( + gat_edf_extra_keys_exclude_res <- + key_colnames(gat_edf, extra_keys = "age_group", exclude = c("geo_value", "time_value")), + class = "lifecycle_warning_deprecated" + ) + expect_equal(gat_edf_extra_keys_exclude_res, c("age_group")) +}) + +test_that("`key_colnames` on tsibbles works as expected", { + k1k2i_tsbl <- tsibble::tsibble(k1 = 1, k2 = 1, i = 1, key = c(k1, k2), index = i) + + # Normal operation: + expect_equal(key_colnames(k1k2i_tsbl), c("k1", "k2", "i")) + + # Currently there is just bare-bones support for tsibbles to not output + # incompatible results based on `data.frame` inheritance: + expect_error( + key_colnames(k1k2i_tsbl, geo_keys = "k1"), + class = "rlib_error_dots_nonempty" + ) + expect_error( + key_colnames(k1k2i_tsbl, time_keys = "k1"), + class = "rlib_error_dots_nonempty" + ) + expect_error( + key_colnames(k1k2i_tsbl, other_keys = "k1"), + class = "rlib_error_dots_nonempty" + ) + + # We guard against confusing cases: + expect_error( + key_colnames(k1k2i_tsbl %>% tsibble::index_by(fake_coarser_i = i)), + class = "epiprocess__key_colnames__incomplete_reindexing_operation" + ) +}) + +test_that("`key_colnames` on `epi_archive`s works as expected", { + gatv_ea <- tibble(geo_value = 1, age_group = 1, time_value = 1, version = 2) %>% + as_epi_archive(other_keys = "age_group") + + # Basic operation: + expect_equal( + key_colnames(gatv_ea), + c("geo_value", "age_group", "time_value", "version") + ) + + # Since we shouldn't have uncertainty about whether we might have an archive + # or not, there's no need to provide compatibility with the key specification + # args: + expect_error( + key_colnames(gatv_ea, geo_keys = "k1"), + class = "rlib_error_dots_nonempty" + ) + expect_error( + key_colnames(gatv_ea, time_keys = "k1"), + class = "rlib_error_dots_nonempty" + ) + expect_error( + key_colnames(gatv_ea, other_keys = "k1"), + class = "rlib_error_dots_nonempty" + ) + + # Key exclusion works: + expect_equal( + key_colnames(gatv_ea, exclude = c("version", "time_value")), + c("geo_value", "age_group") + ) +}) diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R index ff722068..129deba5 100644 --- a/tests/testthat/test-revision-latency-functions.R +++ b/tests/testthat/test-revision-latency-functions.R @@ -1,17 +1,5 @@ dummy_ex <- tibble::tribble( ~geo_value, ~time_value, ~version, ~value, - # al 1 has 1 real revision, a lag of 0, and changes by 99 - "al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1, - "al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1, - "al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100, - # al 2 has no revision, a min lag of 0, and a rel_spread of 0 - "al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1, - # al 3 has 1 revision and a min lag of 1, and a change of 3 - "al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1, - "al", 
as.Date("2020-01-03"), as.Date("2020-01-05"), 4, - # al 4 has 1 revision including NA's none if not, a lag of 0/1 and changes of 0 - "al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA, - "al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9, # ak 1 has 4 revisions w/out NAs, but 6 with NAs # a min lag of 2, and a change of 101 "ak", as.Date("2020-01-01"), as.Date("2020-01-03"), 1, @@ -27,6 +15,18 @@ dummy_ex <- tibble::tribble( # ak 3 has 0 revisions, and a value of zero, and thus a rel_spread of NaN "ak", as.Date("2020-01-03"), as.Date("2020-01-06"), 0, "ak", as.Date("2020-01-03"), as.Date("2020-01-07"), 0, + # al 1 has 1 real revision, a lag of 0, and changes by 99 + "al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1, + "al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1, + "al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100, + # al 2 has no revision, a min lag of 0, and a rel_spread of 0 + "al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1, + # al 3 has 1 revision and a min lag of 1, and a change of 3 + "al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1, + "al", as.Date("2020-01-03"), as.Date("2020-01-05"), 4, + # al 4 has 1 revision including NA's none if not, a lag of 0/1 and changes of 0 + "al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA, + "al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9, ) %>% as_epi_archive(versions_end = as.Date("2022-01-01"), compactify = FALSE) @@ -35,7 +35,24 @@ test_that("revision_summary works for a dummy dataset", { expect_snapshot(dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) }) test_that("tidyselect is functional", { - expect_no_error(revision_summary(dummy_ex, value)) - expect_no_error(revision_summary(dummy_ex, starts_with("val"))) + expect_no_error(quiet(revision_summary(dummy_ex, value))) + expect_no_error(quiet(revision_summary(dummy_ex, starts_with("val")))) + with_later_key_col <- dummy_ex$DT %>% + select(geo_value, time_value, value, version) %>% + as_epi_archive(versions_end = dummy_ex$versions_end, compactify = FALSE) + expect_equal( + quiet(revision_summary(with_later_key_col)), + quiet(revision_summary(dummy_ex)) + ) + with_later_val_col <- dummy_ex$DT %>% + mutate(value2 = 0) %>% + as_epi_archive(versions_end = dummy_ex$versions_end, compactify = FALSE) + expect_equal( + quiet(revision_summary(with_later_val_col, value)), + quiet(revision_summary(dummy_ex, value)) + ) + expect_error(revision_summary(with_later_val_col, !everything()), + class = "epiprocess__revision_summary__selected_zero_columns" + ) }) test_that("revision_summary works for various timetypes", {}) diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 62eea2aa..056a8253 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -152,7 +152,7 @@ revision_details %>% max_lag = max(max_lag), spread = mean(spread), rel_spread = mean(rel_spread), - time_near_latest = mean(time_near_latest) + lag_near_latest = mean(lag_near_latest) ) ```