wip doc: clean up reference, reorganize notebooks

cmu-delphi · Oct 14, 2024 · c8ca1df · c8ca1df
1 parent cb7c1ef
commit c8ca1df
Show file tree

Hide file tree

Showing 44 changed files with 1,247 additions and 1,108 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -42,8 +42,6 @@ S3method(key_colnames,default)
 S3method(key_colnames,epi_archive)
 S3method(key_colnames,epi_df)
 S3method(mean,epi_df)
-S3method(next_after,Date)
-S3method(next_after,integer)
 S3method(print,epi_archive)
 S3method(print,epi_df)
 S3method(print,grouped_epi_archive)
@@ -85,11 +83,9 @@ export(guess_period)
 export(is_epi_df)
 export(is_grouped_epi_archive)
 export(key_colnames)
-export(max_version_with_row_in)
 export(mutate)
 export(new_epi_archive)
 export(new_epi_df)
-export(next_after)
 export(relocate)
 export(rename)
 export(revision_summary)

diff --git a/R/archive.R b/R/archive.R
@@ -22,7 +22,7 @@
 #'
 #' @section Side effects: raises an error if version bound appears invalid
 #'
-#' @noRd
+#' @keywords internal
 validate_version_bound <- function(version_bound, x, na_ok = FALSE,
                                    version_bound_arg = rlang::caller_arg(version_bound),
                                    x_arg = rlang::caller_arg(x)) {
@@ -77,7 +77,7 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE,
 #'
 #' @importFrom checkmate check_names
 #'
-#' @export
+#' @keywords internal
 max_version_with_row_in <- function(x) {
   if (nrow(x) == 0L) {
     cli_abort(
@@ -108,45 +108,18 @@ max_version_with_row_in <- function(x) {
 #' @param x the starting "value"(s)
 #' @return same class, typeof, and length as `x`
 #'
-#' @export
+#' @keywords internal
 next_after <- function(x) UseMethod("next_after")
 
 
-#' @export
+#' @keywords internal
 next_after.integer <- function(x) x + 1L
 
 
-#' @export
+#' @keywords internal
 next_after.Date <- function(x) x + 1L
 
 
-#' Compactify
-#'
-#' This section describes the internals of how compactification works in an
-#' `epi_archive()`. Compactification can potentially improve code speed or
-#' memory usage, depending on your data.
-#'
-#' In general, the last version of each observation is carried forward (LOCF) to
-#' fill in data between recorded versions, and between the last recorded
-#' update and the `versions_end`. One consequence is that the `DT` doesn't
-#' have to contain a full snapshot of every version (although this generally
-#' works), but can instead contain only the rows that are new or changed from
-#' the previous version (see `compactify`, which does this automatically).
-#' Currently, deletions must be represented as revising a row to a special
-#' state (e.g., making the entries `NA` or including a special column that
-#' flags the data as removed and performing some kind of post-processing), and
-#' the archive is unaware of what this state is. Note that `NA`s *can* be
-#' introduced by `epi_archive` methods for other reasons, e.g., in
-#' [`epix_fill_through_version`] and [`epix_merge`], if requested, to
-#' represent potential update data that we do not yet have access to; or in
-#' [`epix_merge`] to represent the "value" of an observation before the
-#' version in which it was first released, or if no version of that
-#' observation appears in the archive data at all.
-#'
-#' @name compactify
-NULL
-
-
 #' `epi_archive` object
 #'
 #' The second main data structure for storing time series in `epiprocess`. It is
@@ -174,6 +147,28 @@ NULL
 #'   on `DT` directly). Note that there can only be a single row per unique
 #'   combination of key variables.
 #'
+#' @section Compactify:
+#' This section describes the internals of how compactification works in an
+#' `epi_archive()`. Compactification can potentially improve code speed or
+#' memory usage, depending on your data.
+#'
+#' In general, the last version of each observation is carried forward (LOCF) to
+#' fill in data between recorded versions, and between the last recorded
+#' update and the `versions_end`. One consequence is that the `DT` doesn't
+#' have to contain a full snapshot of every version (although this generally
+#' works), but can instead contain only the rows that are new or changed from
+#' the previous version (see `compactify`, which does this automatically).
+#' Currently, deletions must be represented as revising a row to a special
+#' state (e.g., making the entries `NA` or including a special column that
+#' flags the data as removed and performing some kind of post-processing), and
+#' the archive is unaware of what this state is. Note that `NA`s *can* be
+#' introduced by `epi_archive` methods for other reasons, e.g., in
+#' [`epix_fill_through_version`] and [`epix_merge`], if requested, to
+#' represent potential update data that we do not yet have access to; or in
+#' [`epix_merge`] to represent the "value" of an observation before the
+#' version in which it was first released, or if no version of that
+#' observation appears in the archive data at all.
+#'
 #' @section Metadata:
 #' The following pieces of metadata are included as fields in an `epi_archive`
 #'   object:
@@ -240,7 +235,8 @@ NULL
 #'   value of `clobberable_versions_start` does not fully trust these empty
 #'   updates, and assumes that any version `>= max(x$version)` could be
 #'   clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
-#' @param compactify_tol double. the tolerance used to detect approximate equality for compactification
+#' @param compactify_tol double. The tolerance used to detect approximate
+#'   equality for compactification
 #' @return An `epi_archive` object.
 #'
 #' @importFrom data.table as.data.table key setkeyv

diff --git a/R/epi_df.R b/R/epi_df.R
@@ -320,6 +320,7 @@ as_epi_df.tbl_ts <- function(x, as_of, other_keys = character(), ...) {
 #' @param x An object.
 #' @return `TRUE` if the object inherits from `epi_df`.
 #'
+#' @rdname epi_df
 #' @export
 is_epi_df <- function(x) {
   inherits(x, "epi_df")

diff --git a/R/epiprocess.R b/R/epiprocess.R
@@ -1,8 +1,8 @@
 #' epiprocess: Tools for basic signal processing in epidemiology
 #'
-#' This package introduces a common data structure for epidemiological data sets
-#' measured over space and time, and offers associated utilities to perform
-#' basic signal processing tasks.
+#' This package introduces common data structures for epidemiological data sets
+#' measured across locations and time, and offers associated utilities to
+#' perform basic signal processing tasks.
 #'
 #' @importFrom checkmate assert assert_scalar assert_data_frame anyMissing
 #'             assert_logical assert_list assert_character assert_class
@@ -13,7 +13,9 @@
 #' @importFrom rlang %||%
 #' @importFrom lifecycle deprecated
 #' @name epiprocess
+#' @keywords internal
 "_PACKAGE"
+
 utils::globalVariables(c(
   ".x", ".group_key", ".ref_time_value", "resid",
   "fitted", ".response", "geo_value", "time_value",

diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R
@@ -89,6 +89,7 @@ print.epi_df <- function(x, ...) {
 #' @method summary epi_df
 #' @importFrom rlang .data
 #' @importFrom stats median
+#' @rdname print.epi_df
 #' @export
 summary.epi_df <- function(object, ...) {
   cat("An `epi_df` x, with metadata:\n")
@@ -123,7 +124,7 @@ summary.epi_df <- function(object, ...) {
 #' @return `x` with any metadata dropped and the `"epi_df"` class, if previously
 #'   present, dropped
 #'
-#' @noRd
+#' @keywords internal
 decay_epi_df <- function(x) {
   attributes(x)$metadata <- NULL
   class(x) <- class(x)[class(x) != "epi_df"]
@@ -140,14 +141,16 @@ decay_epi_df <- function(x) {
 # We'll implement `[` to allow either 1d or 2d. We'll also implement some other
 # methods where we want to (try to) maintain an `epi_df`.
 
+#' dplyr_reconstruct
+#'
 #' @param data tibble or `epi_df` (`dplyr` feeds in former, but we may
 #'   directly feed in latter from our other methods)
 #' @param template `epi_df` template to use to restore
 #' @return `epi_df` or degrade into `tbl_df`
 #' @importFrom dplyr dplyr_reconstruct
 #' @importFrom cli cli_vec
 #' @export
-#' @noRd
+#' @keywords internal
 dplyr_reconstruct.epi_df <- function(data, template) {
   # Start from a reconstruction for the backing S3 classes; this ensures that we
   # keep any grouping that has been applied:

diff --git a/R/outliers.R b/R/outliers.R
@@ -38,6 +38,7 @@
 #'   "stl", shorthand for `detect_outlr_stl()`, which detects outliers via an
 #'   STL decomposition.
 #'
+#' @rdname detect_outlr
 #' @export
 #' @importFrom dplyr select
 #' @examples
@@ -152,6 +153,7 @@ detect_outlr <- function(x = seq_along(y), y,
 #' @template outlier-detection-options
 #' @template detect-outlr-return
 #'
+#' @rdname detect_outlr
 #' @export
 #' @examples
 #' # Detect outliers based on a rolling median
@@ -244,6 +246,7 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21,
 #' The last set of arguments, `log_transform` through `replacement_multiplier`,
 #'   are exactly as in `detect_outlr_rm()`.
 #'
+#' @rdname detect_outlr
 #' @importFrom stats median
 #' @importFrom tidyselect starts_with
 #' @export

diff --git a/R/slide.R b/R/slide.R
@@ -893,8 +893,9 @@ epi_slide_opt <- function(
 #'
 #' @template opt-slide-details
 #'
+#' @rdname epi_slide_opt
 #' @export
-#' @seealso [`epi_slide`] [`epi_slide_opt`] [`epi_slide_sum`]
+#' @seealso [`epi_slide`]
 #' @examples
 #' # slide a 7-day trailing average formula on cases
 #' jhu_csse_daily_subset %>%
@@ -1007,8 +1008,9 @@ epi_slide_mean <- function(
 #'
 #' @template opt-slide-details
 #'
+#' @rdname epi_slide_opt
 #' @export
-#' @seealso [`epi_slide`] [`epi_slide_opt`] [`epi_slide_mean`]
+#' @seealso [`epi_slide`]
 #' @examples
 #' # slide a 7-day trailing sum formula on cases
 #' jhu_csse_daily_subset %>%
@@ -1074,7 +1076,7 @@ epi_slide_sum <- function(
 #' function (using `validate_slide_window_arg`).
 #'
 #' @importFrom checkmate assert_function
-#' @noRd
+#' @keywords internal
 full_date_seq <- function(x, before, after, time_type) {
   if (!time_type %in% c("day", "week", "yearmonth", "integer")) {
     cli_abort(

diff --git a/R/utils.R b/R/utils.R
@@ -182,7 +182,7 @@ format_tibble_row <- function(x, empty = "*none*") {
 #' @importFrom purrr map_lgl
 #' @importFrom utils tail
 #'
-#' @noRd
+#' @keywords internal
 assert_sufficient_f_args <- function(.f, ..., .ref_time_value_label) {
   mandatory_f_args_labels <- c("window data", "group key", .ref_time_value_label)
   n_mandatory_f_args <- length(mandatory_f_args_labels)
@@ -670,6 +670,7 @@ upcase_snake_case <- function(vec) {
 #' the full list of potential substitutions for the `time_value` column name:
 #' `r time_column_names()`
 #' @export
+#' @keywords internal
 time_column_names <- function() {
   substitutions <- c(
     "time_value", "date", "time", "datetime", "dateTime", "date_time", "target_date",
@@ -686,6 +687,7 @@ time_column_names <- function() {
 #' the full list of potential substitutions for the `geo_value` column name:
 #' `r geo_column_names()`
 #' @export
+#' @keywords internal
 geo_column_names <- function() {
   substitutions <- c(
     "geo_value", "geo_values", "geo_id", "geos", "location", "jurisdiction", "fips", "zip",
@@ -702,6 +704,7 @@ geo_column_names <- function() {
 #' the full list of potential substitutions for the `version` column name:
 #' `r version_column_names()`
 #' @export
+#' @keywords internal
 version_column_names <- function() {
   substitutions <- c(
     "version", "issue", "release"
@@ -833,7 +836,7 @@ list2var <- function(x) {
 #'
 #' @importFrom lifecycle deprecated
 #'
-#' @noRd
+#' @keywords internal
 deprecated_quo_is_present <- function(quo) {
   if (!rlang::is_quosure(quo)) {
     cli_abort("`quo` must be a quosure; `enquo` the arg first",
@@ -991,6 +994,7 @@ gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) {
 #'   by adding `k * result` for an integer k, and such that there is no smaller
 #'   `result` that can achieve this.
 #'
+#' @keywords internal
 #' @export
 guess_period <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) {
   UseMethod("guess_period")

diff --git a/README.Rmd b/README.Rmd
@@ -16,45 +16,39 @@ ggplot2::theme_set(ggplot2::theme_bw())
 
 # epiprocess
 
-## TODO: Condense these paragraphs
-
-The [`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) package works
-with epidemiological time series data to provide situational
-awareness, processing, and transformations in preparation for modeling, and
-version-faithful model backtesting. It contains:
-
-- `epi_df`, a class for working with epidemiological time series data which
-behaves like a tibble (and can be manipulated with
-[`{dplyr}`](https://dplyr.tidyverse.org/)-esque "verbs") but with some
-additional structure;
-- `epi_archive`, a class for working with the version history of such time series data;
-- sample epidemiological data in these formats;
-
-This package is provided by the Delphi group at Carnegie Mellon University. The
-Delphi group provides many tools also hosts the Delphi Epidata API, which provides access to a wide
-range of epidemiological data sets, including COVID-19 data, flu data, and more.
-This package is designed to work seamlessly with the data in the Delphi Epidata
-API, which can be accessed using the `epidatr` package.
-
-It is part of a broader suite of packages that includes
-[`{epipredict}`](https://cmu-delphi.github.io/epipredict/),
-[`{epidatr}`](https://cmu-delphi.github.io/epidatr/),
-[`{rtestim}`](https://dajmcdon.github.io/rtestim/), and
-[`{epidatasets}`](https://cmu-delphi.github.io/epidatasets/), for accessing,
-analyzing, and forecasting epidemiological time series data. We have expanded
-documentation and demonstrations for some of these packages available in an
-online "book" format [here](https://cmu-delphi.github.io/delphi-tooling-book/).
-
-## Motivation
-
-[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) and
-[`{epipredict}`](https://cmu-delphi.github.io/epipredict/) are designed to lower
-the barrier to entry and implementation cost for epidemiological time series
-analysis and forecasting. Epidemiologists and forecasting groups repeatedly and
-separately have had to rush to implement this type of functionality in a much
-more ad hoc manner; we are trying to save such effort in the future by providing
-well-documented, tested, and general packages that can be called for many common
-tasks instead.
+The `{epiprocess}` package works with epidemiological time series data and
+provides tools to manage, analyze, and process the data in preparation for
+modeling. It is designed to work in tandem with
+[`{epipredict}`](https://cmu-delphi.github.io/epipredict/), which provides
+pre-built epiforecasting models as well as tools to build custom models.
+Both packages are designed to lower the barrier to entry and implementation cost
+for epidemiological time series analysis and forecasting.
+
+`{epiprocess}` contains:
+
+- `epi_df()` and `epi_archive()`, two data frame classes (that work like a
+`{tibble}` with `{dplyr}` verbs) for working with epidemiological time
+series data;
+- signal processing tools building on these data structures such as
+  - `epi_slide()` for sliding window operations;
+  - `epix_slide()` for sliding window operations on archives;
+  - `growth_rate()` for computing growth rates;
+  - `detect_outlr()` for outlier detection;
+  - `epi_cor()` for computing correlations.
+
+If you are new to this set of tools, you may be interested in learning through a
+book format: [Introduction to Epidemiological
+Forecasting](https://cmu-delphi.github.io/delphi-tooling-book/).
+
+You may also be interested in:
+
+- [`{epidatr}`](https://cmu-delphi.github.io/epidatr/), for accessing a wide range
+of epidemiological data sets, including COVID-19 data, flu data, and more.
+- [`{rtestim}`](https://dajmcdon.github.io/rtestim/), a package for estimating
+the time-varying reproduction number of an epidemic.
+
+This package is provided by the [Delphi group](https://delphi.cmu.edu/) at
+Carnegie Mellon University.
 
 ## Installation
 
@@ -133,7 +127,7 @@ edf %>%
   mutate(cases_growth = growth_rate(x = time_value, y = cases_cumulative, method = "rel_change", h = 7))
 ```
 
-Detect outliers in the growth rate of the confirmed cumulative cases for each
+Detect outliers in daily reported cases for each `geo_value`:
 
 ```{r}
 edf %>%
@@ -163,3 +157,4 @@ edf %>%
   epi_slide_mean(deaths_daily, .window_size = 7, na.rm = TRUE) %>%
   epi_cor(cases_daily, deaths_daily)
 ```
+