From 3d3f6b594716991ed939ba956965d158ece4559f Mon Sep 17 00:00:00 2001 From: "Logan C. Brook" Date: Fri, 15 Dec 2023 05:14:44 -0800 Subject: [PATCH 1/8] Bump epidatr & epipredict versions, update due to breaking changes Bump versions, update renv.lock. Adjust for breaking changes: - Some obvious renames. - Replace both `pivot_quantiles` and `unnest` `pivot_wider` patterns to `pivot_quantiles_wider`. - Fix some `epi_recipe` and `frosting` printing that doesn't play well with knitr now. Update _freeze files where a first pass visual diff identified "real" differences rather than just a tiny visual offset. --- _common.R | 11 + _freeze/archive/execute-results/html.json | 4 +- .../archive/figure-html/unnamed-chunk-9-1.svg | 1179 ++-- .../figure-html/unnamed-chunk-10-1.svg | 297 +- .../figure-html/unnamed-chunk-4-1.svg | 345 +- .../figure-html/unnamed-chunk-6-1.svg | 381 +- .../figure-html/unnamed-chunk-8-1.svg | 271 +- _freeze/epidf/execute-results/html.json | 4 +- _freeze/epipredict/execute-results/html.json | 4 +- .../execute-results/html.json | 4 +- .../figure-html/unnamed-chunk-15-1.svg | 879 +++ .../execute-results/html.json | 4 +- .../growth-rates/execute-results/html.json | 2 +- _freeze/index/execute-results/html.json | 4 +- _freeze/outliers/execute-results/html.json | 2 +- .../execute-results/html.json | 4 +- _freeze/slide/execute-results/html.json | 4 +- .../execute-results/html.json | 4 +- .../execute-results/html.json | 2 +- .../figure-html/unnamed-chunk-23-1.svg | 647 +- .../figure-html/unnamed-chunk-26-1.svg | 273 +- .../execute-results/html.json | 2 +- .../figure-html/unnamed-chunk-21-1.svg | 5234 ++++++++--------- .../figure-html/unnamed-chunk-24-1.svg | 5180 ++++++++-------- archive.qmd | 7 +- epidf.qmd | 6 +- epipredict.qmd | 4 +- flatline-forecaster.qmd | 32 +- forecast-framework.qmd | 11 +- index.qmd | 4 +- packages.bib | 15 +- preprocessing-and-models.qmd | 24 +- renv.lock | 275 +- slide.qmd | 6 +- sliding-forecasters.qmd | 6 +- 35 files changed, 
8082 insertions(+), 7049 deletions(-) create mode 100644 _freeze/flatline-forecaster/figure-html/unnamed-chunk-15-1.svg diff --git a/_common.R b/_common.R index e334be9..864ca0a 100644 --- a/_common.R +++ b/_common.R @@ -42,3 +42,14 @@ options( ggplot2::theme_set(ggplot2::theme_bw()) +# Workaround for interleaved `cat`s and `message`s (from `cli`) getting +# intercepted and not combined properly by `collapse: true`: +with_messages_cat_to_stdout <- function(code) { + withCallingHandlers( + code, + message = function(m) { + cat(m$message) + tryInvokeRestart("muffleMessage") + } + ) +} diff --git a/_freeze/archive/execute-results/html.json b/_freeze/archive/execute-results/html.json index ce9fb65..29d86a0 100644 --- a/_freeze/archive/execute-results/html.json +++ b/_freeze/archive/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "a408bcb10d91a497bce0dca3fe63c34d", + "hash": "4d2b35f719103c28883f0aeeaf2e3e2b", "result": { - "markdown": "# Work with archive objects and data revisions\n\nIn addition to the `epi_df` data structure, which we have been working with all\nalong in these vignettes, the `epiprocess` package has a companion structure\ncalled `epi_archive`. In comparison to an `epi_df` object, which can be seen as\nstoring a single snapshot of a data set with the most up-to-date signal values\nas of some given time, an `epi_archive` object stores the full version history\nof a data set. Many signals of interest for epidemiological tracking are subject\nto revision (some more than others), and paying attention to data revisions can\nbe important for all sorts of downstream data analysis and modeling tasks.\n\nThis chapter walks through working with `epi_archive` objects and demonstrates\nsome of their key functionality. We'll work with a signal on the percentage of\ndoctor's visits with CLI (COVID-like illness) computed from medical insurance\nclaims, available through the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). 
This\nsignal is subject to very heavy and regular revision; you can read more about it\non its [API documentation\npage](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). We'll use the offline version stored in `{epidatasets}`.\n\n\n\n\n\n\n## Getting data into `epi_archive` format\n\nAn `epi_archive` object\ncan be constructed from a data frame, data table, or tibble, provided that it\nhas (at least) the following columns:\n\n* `geo_value`: the geographic value associated with each row of measurements.\n* `time_value`: the time value associated with each row of measurements.\n* `version`: the time value specifying the version for each row of measurements.\n For example, if in a given row the `version` is January 15, 2022 and\n `time_value` is January 14, 2022, then this row contains the measurements of\n the data for January 14, 2022 that were available one day later.\n\nAs we can see from the above, the data frame returned by\n`epidatr::covidcast()` has the columns required for the `epi_archive`\nformat, so we use\n`as_epi_archive()` to cast it into `epi_archive` format.[^1]\n\n[^1]: For a discussion of the removal of\nredundant version updates in `as_epi_archive` using compactify, please refer\nto the [compactify vignette](https://cmu-delphi.github.io/epiprocess/articles/compactify.html).\n\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-2_39c5cbdbb56253b327ea66e6ab4e8220'}\n\n```{.r .cell-code}\nx <- archive_cases_dv_subset_dt %>%\n select(geo_value, time_value, version, percent_cli) %>%\n as_epi_archive(compactify = TRUE)\n\nclass(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_archive\" \"R6\"\n```\n:::\n\n```{.r .cell-code}\nprint(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_archive` object, with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> ----------\n#> * min time value = 2020-06-01\n#> * max time value = 2021-11-30\n#> * first 
version with update = 2020-06-02\n#> * last version with update = 2021-12-01\n#> * No clobberable versions\n#> * versions end = 2021-12-01\n#> ----------\n#> Data archive (stored in DT field): 119316 x 4\n#> Columns in DT: geo_value, time_value, version, percent_cli\n#> ----------\n#> Public R6 methods: initialize, print, as_of, fill_through_version, \n#> truncate_versions_after, merge, group_by, slide, clone\n```\n:::\n:::\n\n\nAn `epi_archive` is special kind of class called an R6 class. Its primary field\nis a data table `DT`, which is of class `data.table` (from the `data.table`\npackage), and has columns `geo_value`, `time_value`, `version`, as well as any\nnumber of additional columns.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-3_99d23f4e3321a367498344c4b6282562'}\n\n```{.r .cell-code}\nclass(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"data.table\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nhead(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> geo_value time_value version percent_cli\n#> 1: ca 2020-06-01 2020-06-02 NA\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> 6: ca 2020-06-01 2020-06-11 2.197207\n```\n:::\n:::\n\n\nThe variables `geo_value`, `time_value`, `version` serve as **key variables**\nfor the data table, as well as any other specified in the metadata (described\nbelow). 
There can only be a single row per unique combination of key variables,\nand therefore the key variables are critical for figuring out how to generate a\nsnapshot of data from the archive, as of a given version (also described below).\n \n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-4_8b3712fe1140194d1eb702521cf15238'}\n\n```{.r .cell-code}\nkey(x$DT)\n```\n\n::: {.cell-output .cell-output-error}\n```\n#> Error in key(x$DT): could not find function \"key\"\n```\n:::\n:::\n\n \nIn general, the last version of each observation is carried forward (LOCF) to\nfill in data between recorded versions. **A word of caution:** R6 objects,\nunlike most other objects in R, have reference semantics. An important\nconsequence of this is that objects are not copied when modified.\n \n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-5_86ba88485d14cbc7ddf328b75c606b4d'}\n\n```{.r .cell-code}\noriginal_value <- x$DT$percent_cli[1]\ny <- x # This DOES NOT make a copy of x\ny$DT$percent_cli[1] <- 0\nhead(y$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> geo_value time_value version percent_cli\n#> 1: ca 2020-06-01 2020-06-02 0.000000\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> 6: ca 2020-06-01 2020-06-11 2.197207\n```\n:::\n\n```{.r .cell-code}\nhead(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> geo_value time_value version percent_cli\n#> 1: ca 2020-06-01 2020-06-02 0.000000\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> 6: ca 2020-06-01 2020-06-11 2.197207\n```\n:::\n\n```{.r .cell-code}\nx$DT$percent_cli[1] <- original_value\n```\n:::\n\n\nTo make a copy, we can use the `clone()` method for an R6 class, as in `y <-\nx$clone()`. 
You can read more about reference semantics in Hadley Wickham's\n[Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book.\n\n## Some details on metadata\n\nThe following pieces of metadata are included as fields in an `epi_archive`\nobject: \n\n* `geo_type`: the type for the geo values.\n* `time_type`: the type for the time values.\n* `additional_metadata`: list of additional metadata for the data archive.\n\nMetadata for an `epi_archive` object `x` can be accessed (and altered) directly,\nas in `x$geo_type` or `x$time_type`, etc. Just like `as_epi_df()`, the function\n`as_epi_archive()` attempts to guess metadata fields when an `epi_archive`\nobject is instantiated, if they are not explicitly specified in the function\ncall (as it did in the case above).\n\n## Producing snapshots in `epi_df` form\n\nA key method of an `epi_archive` class is `as_of()`, which generates a snapshot\nof the archive in `epi_df` format. This represents the most up-to-date values of\nthe signal variables as of a given version. 
This can be accessed via `x$as_of()`\nfor an `epi_archive` object `x`, but the package also provides a simple wrapper \nfunction `epix_as_of()` since this is likely a more familiar interface for users\nnot familiar with R6 (or object-oriented programming).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-6_0150335f0031c0eb619a4ab5e1b2b899'}\n\n```{.r .cell-code}\nx_snapshot <- epix_as_of(x, max_version = as.Date(\"2021-06-01\"))\nclass(x_snapshot)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_df\" \"tbl_df\" \"tbl\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nx_snapshot\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 1,460 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2021-06-01\n#> \n#> # A tibble: 1,460 × 3\n#> geo_value time_value percent_cli\n#> * \n#> 1 ca 2020-06-01 2.75\n#> 2 ca 2020-06-02 2.57\n#> 3 ca 2020-06-03 2.48\n#> 4 ca 2020-06-04 2.41\n#> 5 ca 2020-06-05 2.57\n#> 6 ca 2020-06-06 2.63\n#> # ℹ 1,454 more rows\n```\n:::\n\n```{.r .cell-code}\nmax(x_snapshot$time_value)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"2021-05-31\"\n```\n:::\n\n```{.r .cell-code}\nattributes(x_snapshot)$metadata$as_of\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"2021-06-01\"\n```\n:::\n:::\n\n\nWe can see that the max time value in the `epi_df` object `x_snapshot` that was \ngenerated from the archive is May 29, 2021, even though the specified version\ndate was June 1, 2021. From this we can infer that the doctor's visits signal\nwas 2 days latent on June 1. Also, we can see that the metadata in the `epi_df`\nobject has the version date recorded in the `as_of` field.\n\nBy default, using the maximum of the `version` column in the underlying data table in an\n`epi_archive` object itself generates a snapshot of the latest values of signal\nvariables in the entire archive. 
The `epix_as_of()` function issues a warning in\nthis case, since updates to the current version may still come in at a later \npoint in time, due to various reasons, such as synchronization issues.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-7_d5f40a899e63f06b4b5411a752857f2a'}\n\n```{.r .cell-code}\nx_latest <- epix_as_of(x, max_version = max(x$DT$version))\n```\n:::\n\n\nBelow, we pull several snapshots from the archive, spaced one month apart. We\noverlay the corresponding signal curves as colored lines, with the version dates\nmarked by dotted vertical lines, and draw the latest curve in black (from the \nlatest snapshot `x_latest` that the archive can provide).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-8_204613e6af4268fe83f46e1635e0ba9e'}\n\n```{.r .cell-code}\nself_max <- max(x$DT$version)\nversions <- seq(as.Date(\"2020-06-01\"), self_max - 1, by = \"1 month\")\nsnapshots <- map(\n versions,\n function(v) {\n epix_as_of(x, max_version = v) %>% mutate(version = v)\n }\n) %>%\n list_rbind() %>%\n bind_rows(x_latest %>% mutate(version = self_max)) %>%\n mutate(latest = version == self_max)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-9_abb01f2c77a56adc9b3456f605179f88'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n snapshots %>% filter(!latest),\n aes(x = time_value, y = percent_cli)\n) +\n geom_line(aes(color = factor(version)), na.rm = TRUE) +\n geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n scale_color_viridis_d(option = \"A\", end = .9) +\n labs(x = \"Date\", y = \"% of doctor's visits with CLI\") +\n theme(legend.position = \"none\") +\n geom_line(\n data = snapshots %>% filter(latest),\n aes(x = time_value, y = percent_cli),\n inherit.aes = FALSE, color = \"black\", na.rm = TRUE\n 
)\n```\n\n::: {.cell-output-display}\n![](archive_files/figure-html/unnamed-chunk-9-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nWe can see some interesting and highly nontrivial revision behavior: at some\npoints in time the provisional data snapshots grossly underestimate the latest\ncurve (look in particular at Florida close to the end of 2021), and at others\nthey overestimate it (both states towards the beginning of 2021), though not \nquite as dramatically. Modeling the revision process, which is often called\n*backfill modeling*, is an important statistical problem in it of itself.\n\n\n## Merging `epi_archive` objects \n\nNow we demonstrate how to merge two `epi_archive` objects together, e.g., so\nthat grabbing data from multiple sources as of a particular version can be\nperformed with a single `as_of` call. The `epi_archive` class provides a method\n`merge()` precisely for this purpose. The wrapper function is called\n`epix_merge()`; this wrapper avoids mutating its inputs, while `x$merge` will\nmutate `x`. 
Below we merge the working `epi_archive` of versioned percentage CLI\nfrom outpatient visits to another one of versioned COVID-19 case reporting data,\nwhich we fetch the from the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html/), on the\nrate scale (counts per 100,000 people in the population).\n\nWhen merging archives, unless the archives have identical data release patterns,\n`NA`s can be introduced in the non-key variables for a few reasons:\n- to represent the \"value\" of an observation before its initial release (when we\n need to pair it with additional observations from the other archive that have\n been released)\n- to represent the \"value\" of an observation that has no recorded versions at\n all (in the same sort of situation)\n- if requested via `sync = \"na\"`, to represent potential update data that we do\n not yet have access to (e.g., due to encountering issues while attempting to\n download the currently available version data for one of the archives, but not\n the other).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-10_a7df6e05f56128886189457e1bf6c106'}\n\n```{.r .cell-code}\n# This code is for illustration and doesn't run.\n# The result is saved/loaded in the (hidden) next chunk from `{epidatasets}`\ny <- covidcast(\n data_source = \"jhu-csse\",\n signals = \"confirmed_7dav_incidence_prop\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20200601, 20211201),\n geo_values = \"ca,fl,ny,tx\",\n issues = epirange(20200601, 20211201)\n) %>%\n fetch() %>%\n select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%\n as_epi_archive(compactify = TRUE)\n\nx$merge(y, sync = \"locf\", compactify = FALSE)\nprint(x)\nhead(x$DT)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-11_02fcba02d29e69cfaaf1db0683d5eb4c'}\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_archive` object, with metadata:\n#> * 
geo_type = state\n#> * time_type = day\n#> ----------\n#> * min time value = 2020-06-01\n#> * max time value = 2021-11-30\n#> * first version with update = 2020-06-02\n#> * last version with update = 2021-12-01\n#> * No clobberable versions\n#> * versions end = 2021-12-01\n#> ----------\n#> Data archive (stored in DT field): 129638 x 5\n#> Columns in DT: geo_value, time_value, version, percent_cli and 1 more columns\n#> ----------\n#> Public R6 methods: initialize, print, as_of, fill_through_version, \n#> truncate_versions_after, merge, group_by, slide, clone\n```\n:::\n\n::: {.cell-output .cell-output-stdout}\n```\n#> geo_value time_value version percent_cli case_rate_7d_av\n#> 1: ca 2020-06-01 2020-06-02 NA 6.628329\n#> 2: ca 2020-06-01 2020-06-06 2.140116 6.628329\n#> 3: ca 2020-06-01 2020-06-07 2.140116 6.628329\n#> 4: ca 2020-06-01 2020-06-08 2.140379 6.628329\n#> 5: ca 2020-06-01 2020-06-09 2.114430 6.628329\n#> 6: ca 2020-06-01 2020-06-10 2.133677 6.628329\n```\n:::\n:::\n\n\nImportantly, see that `x$merge` mutated `x` to hold the result of the merge. We\ncould also have used `xy = epix_merge(x, y)` to avoid mutating `x`. See the\ndocumentation for either for more detailed descriptions of what mutation,\npointer aliasing, and pointer reseating is possible.\n\n## Sliding version-aware computations\n \n::: {.callout-note}\nTODO: need a simple example here.\n:::", + "markdown": "# Work with archive objects and data revisions\n\nIn addition to the `epi_df` data structure, which we have been working with all\nalong in these vignettes, the `epiprocess` package has a companion structure\ncalled `epi_archive`. In comparison to an `epi_df` object, which can be seen as\nstoring a single snapshot of a data set with the most up-to-date signal values\nas of some given time, an `epi_archive` object stores the full version history\nof a data set. 
Many signals of interest for epidemiological tracking are subject\nto revision (some more than others), and paying attention to data revisions can\nbe important for all sorts of downstream data analysis and modeling tasks.\n\nThis chapter walks through working with `epi_archive` objects and demonstrates\nsome of their key functionality. We'll work with a signal on the percentage of\ndoctor's visits with CLI (COVID-like illness) computed from medical insurance\nclaims, available through the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). This\nsignal is subject to very heavy and regular revision; you can read more about it\non its [API documentation\npage](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). We'll use the offline version stored in `{epidatasets}`.\n\n\n\n\n\n\n## Getting data into `epi_archive` format\n\nAn `epi_archive` object\ncan be constructed from a data frame, data table, or tibble, provided that it\nhas (at least) the following columns:\n\n* `geo_value`: the geographic value associated with each row of measurements.\n* `time_value`: the time value associated with each row of measurements.\n* `version`: the time value specifying the version for each row of measurements.\n For example, if in a given row the `version` is January 15, 2022 and\n `time_value` is January 14, 2022, then this row contains the measurements of\n the data for January 14, 2022 that were available one day later.\n\nAs we can see from the above, the data frame returned by\n`epidatr::covidcast()` has the columns required for the `epi_archive`\nformat, so we use\n`as_epi_archive()` to cast it into `epi_archive` format.[^1]\n\n[^1]: For a discussion of the removal of\nredundant version updates in `as_epi_archive` using compactify, please refer\nto the [compactify vignette](https://cmu-delphi.github.io/epiprocess/articles/compactify.html).\n\n\n\n::: {.cell layout-align=\"center\" 
hash='archive_cache/html/unnamed-chunk-2_39c5cbdbb56253b327ea66e6ab4e8220'}\n\n```{.r .cell-code}\nx <- archive_cases_dv_subset_dt %>%\n select(geo_value, time_value, version, percent_cli) %>%\n as_epi_archive(compactify = TRUE)\n\nclass(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_archive\" \"R6\"\n```\n:::\n\n```{.r .cell-code}\nprint(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_archive` object, with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> ----------\n#> * min time value = 2020-06-01\n#> * max time value = 2021-11-30\n#> * first version with update = 2020-06-02\n#> * last version with update = 2021-12-01\n#> * No clobberable versions\n#> * versions end = 2021-12-01\n#> ----------\n#> Data archive (stored in DT field): 119316 x 4\n#> Columns in DT: geo_value, time_value, version, percent_cli\n#> ----------\n#> Public R6 methods: initialize, print, as_of, fill_through_version, \n#> truncate_versions_after, merge, group_by, slide, clone\n```\n:::\n:::\n\n\nAn `epi_archive` is special kind of class called an R6 class. 
Its primary field\nis a data table `DT`, which is of class `data.table` (from the `data.table`\npackage), and has columns `geo_value`, `time_value`, `version`, as well as any\nnumber of additional columns.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-3_99d23f4e3321a367498344c4b6282562'}\n\n```{.r .cell-code}\nclass(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"data.table\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nhead(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> geo_value time_value version percent_cli\n#> 1: ca 2020-06-01 2020-06-02 NA\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> 6: ca 2020-06-01 2020-06-11 2.197207\n```\n:::\n:::\n\n\nThe variables `geo_value`, `time_value`, `version` serve as **key variables**\nfor the data table, as well as any other specified in the metadata (described\nbelow). There can only be a single row per unique combination of key variables,\nand therefore the key variables are critical for figuring out how to generate a\nsnapshot of data from the archive, as of a given version (also described below).\n \n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-4_8b3712fe1140194d1eb702521cf15238'}\n\n```{.r .cell-code}\nkey(x$DT)\n```\n\n::: {.cell-output .cell-output-error}\n```\n#> Error in key(x$DT): could not find function \"key\"\n```\n:::\n:::\n\n \nIn general, the last version of each observation is carried forward (LOCF) to\nfill in data between recorded versions. **A word of caution:** R6 objects,\nunlike most other objects in R, have reference semantics. 
An important\nconsequence of this is that objects are not copied when modified.\n \n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-5_86ba88485d14cbc7ddf328b75c606b4d'}\n\n```{.r .cell-code}\noriginal_value <- x$DT$percent_cli[1]\ny <- x # This DOES NOT make a copy of x\ny$DT$percent_cli[1] <- 0\nhead(y$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> geo_value time_value version percent_cli\n#> 1: ca 2020-06-01 2020-06-02 0.000000\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> 6: ca 2020-06-01 2020-06-11 2.197207\n```\n:::\n\n```{.r .cell-code}\nhead(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> geo_value time_value version percent_cli\n#> 1: ca 2020-06-01 2020-06-02 0.000000\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> 6: ca 2020-06-01 2020-06-11 2.197207\n```\n:::\n\n```{.r .cell-code}\nx$DT$percent_cli[1] <- original_value\n```\n:::\n\n\nTo make a copy, we can use the `clone()` method for an R6 class, as in `y <-\nx$clone()`. You can read more about reference semantics in Hadley Wickham's\n[Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book.\n\n## Some details on metadata\n\nThe following pieces of metadata are included as fields in an `epi_archive`\nobject: \n\n* `geo_type`: the type for the geo values.\n* `time_type`: the type for the time values.\n* `additional_metadata`: list of additional metadata for the data archive.\n\nMetadata for an `epi_archive` object `x` can be accessed (and altered) directly,\nas in `x$geo_type` or `x$time_type`, etc. 
Just like `as_epi_df()`, the function\n`as_epi_archive()` attempts to guess metadata fields when an `epi_archive`\nobject is instantiated, if they are not explicitly specified in the function\ncall (as it did in the case above).\n\n## Producing snapshots in `epi_df` form\n\nA key method of an `epi_archive` class is `as_of()`, which generates a snapshot\nof the archive in `epi_df` format. This represents the most up-to-date values of\nthe signal variables as of a given version. This can be accessed via `x$as_of()`\nfor an `epi_archive` object `x`, but the package also provides a simple wrapper \nfunction `epix_as_of()` since this is likely a more familiar interface for users\nnot familiar with R6 (or object-oriented programming).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-6_0150335f0031c0eb619a4ab5e1b2b899'}\n\n```{.r .cell-code}\nx_snapshot <- epix_as_of(x, max_version = as.Date(\"2021-06-01\"))\nclass(x_snapshot)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_df\" \"tbl_df\" \"tbl\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nx_snapshot\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 1,460 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2021-06-01\n#> \n#> # A tibble: 1,460 × 3\n#> geo_value time_value percent_cli\n#> * \n#> 1 ca 2020-06-01 2.75\n#> 2 ca 2020-06-02 2.57\n#> 3 ca 2020-06-03 2.48\n#> 4 ca 2020-06-04 2.41\n#> 5 ca 2020-06-05 2.57\n#> 6 ca 2020-06-06 2.63\n#> # ℹ 1,454 more rows\n```\n:::\n\n```{.r .cell-code}\nmax(x_snapshot$time_value)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"2021-05-31\"\n```\n:::\n\n```{.r .cell-code}\nattributes(x_snapshot)$metadata$as_of\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"2021-06-01\"\n```\n:::\n:::\n\n\nWe can see that the max time value in the `epi_df` object `x_snapshot` that was \ngenerated from the archive is May 29, 2021, even though the specified version\ndate 
was June 1, 2021. From this we can infer that the doctor's visits signal\nwas 2 days latent on June 1. Also, we can see that the metadata in the `epi_df`\nobject has the version date recorded in the `as_of` field.\n\nBy default, using the maximum of the `version` column in the underlying data table in an\n`epi_archive` object itself generates a snapshot of the latest values of signal\nvariables in the entire archive. The `epix_as_of()` function issues a warning in\nthis case, since updates to the current version may still come in at a later \npoint in time, due to various reasons, such as synchronization issues.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-7_d5f40a899e63f06b4b5411a752857f2a'}\n\n```{.r .cell-code}\nx_latest <- epix_as_of(x, max_version = max(x$DT$version))\n```\n:::\n\n\nBelow, we pull several snapshots from the archive, spaced one month apart. We\noverlay the corresponding signal curves as colored lines, with the version dates\nmarked by dotted vertical lines, and draw the latest curve in black (from the \nlatest snapshot `x_latest` that the archive can provide).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-8_204613e6af4268fe83f46e1635e0ba9e'}\n\n```{.r .cell-code}\nself_max <- max(x$DT$version)\nversions <- seq(as.Date(\"2020-06-01\"), self_max - 1, by = \"1 month\")\nsnapshots <- map(\n versions,\n function(v) {\n epix_as_of(x, max_version = v) %>% mutate(version = v)\n }\n) %>%\n list_rbind() %>%\n bind_rows(x_latest %>% mutate(version = self_max)) %>%\n mutate(latest = version == self_max)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-9_abb01f2c77a56adc9b3456f605179f88'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n snapshots %>% filter(!latest),\n aes(x = time_value, y = percent_cli)\n) +\n geom_line(aes(color = factor(version)), na.rm = TRUE) +\n geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +\n 
facet_wrap(~geo_value, scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n scale_color_viridis_d(option = \"A\", end = .9) +\n labs(x = \"Date\", y = \"% of doctor's visits with CLI\") +\n theme(legend.position = \"none\") +\n geom_line(\n data = snapshots %>% filter(latest),\n aes(x = time_value, y = percent_cli),\n inherit.aes = FALSE, color = \"black\", na.rm = TRUE\n )\n```\n\n::: {.cell-output-display}\n![](archive_files/figure-html/unnamed-chunk-9-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nWe can see some interesting and highly nontrivial revision behavior: at some\npoints in time the provisional data snapshots grossly underestimate the latest\ncurve (look in particular at Florida close to the end of 2021), and at others\nthey overestimate it (both states towards the beginning of 2021), though not \nquite as dramatically. Modeling the revision process, which is often called\n*backfill modeling*, is an important statistical problem in it of itself.\n\n\n## Merging `epi_archive` objects \n\nNow we demonstrate how to merge two `epi_archive` objects together, e.g., so\nthat grabbing data from multiple sources as of a particular version can be\nperformed with a single `as_of` call. The `epi_archive` class provides a method\n`merge()` precisely for this purpose. The wrapper function is called\n`epix_merge()`; this wrapper avoids mutating its inputs, while `x$merge` will\nmutate `x`. 
Below we merge the working `epi_archive` of versioned percentage CLI\nfrom outpatient visits to another one of versioned COVID-19 case reporting data,\nwhich we fetch the from the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html/), on the\nrate scale (counts per 100,000 people in the population).\n\nWhen merging archives, unless the archives have identical data release patterns,\n`NA`s can be introduced in the non-key variables for a few reasons:\n- to represent the \"value\" of an observation before its initial release (when we\n need to pair it with additional observations from the other archive that have\n been released)\n- to represent the \"value\" of an observation that has no recorded versions at\n all (in the same sort of situation)\n- if requested via `sync = \"na\"`, to represent potential update data that we do\n not yet have access to (e.g., due to encountering issues while attempting to\n download the currently available version data for one of the archives, but not\n the other).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-10_f17506759a99a453bf60434e742adfa5'}\n\n```{.r .cell-code}\n# This code is for illustration and doesn't run.\n# The result is saved/loaded in the (hidden) next chunk from `{epidatasets}`\ny <- pub_covidcast(\n source = \"jhu-csse\",\n signals = \"confirmed_7dav_incidence_prop\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20200601, 20211201),\n geo_values = \"ca,fl,ny,tx\",\n issues = epirange(20200601, 20211201)\n) %>%\n select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%\n as_epi_archive(compactify = TRUE)\n\nx$merge(y, sync = \"locf\", compactify = FALSE)\nprint(x)\nhead(x$DT)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-11_02fcba02d29e69cfaaf1db0683d5eb4c'}\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_archive` object, with metadata:\n#> * geo_type = state\n#> 
* time_type = day\n#> ----------\n#> * min time value = 2020-06-01\n#> * max time value = 2021-11-30\n#> * first version with update = 2020-06-02\n#> * last version with update = 2021-12-01\n#> * No clobberable versions\n#> * versions end = 2021-12-01\n#> ----------\n#> Data archive (stored in DT field): 129638 x 5\n#> Columns in DT: geo_value, time_value, version, percent_cli and 1 more columns\n#> ----------\n#> Public R6 methods: initialize, print, as_of, fill_through_version, \n#> truncate_versions_after, merge, group_by, slide, clone\n```\n:::\n\n::: {.cell-output .cell-output-stdout}\n```\n#> geo_value time_value version percent_cli case_rate_7d_av\n#> 1: ca 2020-06-01 2020-06-02 NA 6.628329\n#> 2: ca 2020-06-01 2020-06-06 2.140116 6.628329\n#> 3: ca 2020-06-01 2020-06-07 2.140116 6.628329\n#> 4: ca 2020-06-01 2020-06-08 2.140379 6.628329\n#> 5: ca 2020-06-01 2020-06-09 2.114430 6.628329\n#> 6: ca 2020-06-01 2020-06-10 2.133677 6.628329\n```\n:::\n:::\n\n\nImportantly, see that `x$merge` mutated `x` to hold the result of the merge. We\ncould also have used `xy = epix_merge(x, y)` to avoid mutating `x`. 
See the\ndocumentation for either for more detailed descriptions of what mutation,\npointer aliasing, and pointer reseating is possible.\n\n## Sliding version-aware computations\n \n::: {.callout-note}\nTODO: need a simple example here.\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/archive/figure-html/unnamed-chunk-9-1.svg b/_freeze/archive/figure-html/unnamed-chunk-9-1.svg index 6583533..9807b57 100644 --- a/_freeze/archive/figure-html/unnamed-chunk-9-1.svg +++ b/_freeze/archive/figure-html/unnamed-chunk-9-1.svgdiff --git a/_freeze/correlations/figure-html/unnamed-chunk-10-1.svg b/_freeze/correlations/figure-html/unnamed-chunk-10-1.svg index 879d990..8e9b6f7 100644 --- a/_freeze/correlations/figure-html/unnamed-chunk-10-1.svg +++ b/_freeze/correlations/figure-html/unnamed-chunk-10-1.svgdiff --git a/_freeze/correlations/figure-html/unnamed-chunk-4-1.svg b/_freeze/correlations/figure-html/unnamed-chunk-4-1.svg index 0c0b14b..1fc6938 100644 --- a/_freeze/correlations/figure-html/unnamed-chunk-4-1.svg +++ b/_freeze/correlations/figure-html/unnamed-chunk-4-1.svgdiff --git a/_freeze/correlations/figure-html/unnamed-chunk-6-1.svg b/_freeze/correlations/figure-html/unnamed-chunk-6-1.svg index 23f9e06..bcd8590 100644 --- a/_freeze/correlations/figure-html/unnamed-chunk-6-1.svg +++ b/_freeze/correlations/figure-html/unnamed-chunk-6-1.svgdiff --git a/_freeze/correlations/figure-html/unnamed-chunk-8-1.svg b/_freeze/correlations/figure-html/unnamed-chunk-8-1.svg index 03701e5..4427be2 100644 --- a/_freeze/correlations/figure-html/unnamed-chunk-8-1.svg +++ b/_freeze/correlations/figure-html/unnamed-chunk-8-1.svgdiff --git a/_freeze/epidf/execute-results/html.json b/_freeze/epidf/execute-results/html.json index 00465db..b23b744 100644 --- a/_freeze/epidf/execute-results/html.json +++ b/_freeze/epidf/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "c7375aa7e394a10a2fc7601297e2725a", + "hash": "ef06ff93fa44c25cdac272e9bf1aeb9b", 
"result": { - "markdown": "# Getting data into epi_df format\n\n\n\n\n\nWe'll start by showing how to get data into \n`epi_df`, which is just\na tibble with a bit of special structure, and is the format assumed by all of\nthe functions in the `epiprocess` package. An `epi_df` object has (at least) the\nfollowing columns:\n\n* `geo_value`: the geographic value associated with each row of measurements.\n* `time_value`: the time value associated with each row of measurements.\n\nIt can have any number of other columns which can serve as measured variables,\nwhich we also broadly refer to as signal variables. The documentation for\n gives more details about this data format.\n\nA data frame or tibble that has `geo_value` and `time_value` columns can be\nconverted into an `epi_df` object, using the function `as_epi_df()`. As an\nexample, we'll work with daily cumulative COVID-19 cases from four U.S. states:\nCA, FL, NY, and TX, over time span from mid 2020 to early 2022, and we'll use\nthe [`epidatr`](https://github.com/cmu-delphi/epidatr) package\nto fetch this data from the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html).\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-2_efe3f1308a52e4890152042903eb8790'}\n\n```{.r .cell-code}\nlibrary(epidatr)\nlibrary(epiprocess)\nlibrary(withr)\n\ncases <- covidcast(\n data_source = \"jhu-csse\",\n signals = \"confirmed_cumulative_num\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20200301, 20220131),\n geo_values = \"ca,fl,ny,tx\"\n) %>% fetch()\n\ncolnames(cases)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"geo_value\" \"signal\" \"source\" \n#> [4] \"geo_type\" \"time_type\" \"time_value\" \n#> [7] \"direction\" \"issue\" \"lag\" \n#> [10] \"missing_value\" \"missing_stderr\" \"missing_sample_size\"\n#> [13] \"value\" \"stderr\" \"sample_size\"\n```\n:::\n:::\n\n\nAs we can see, a data frame returned by 
`epidatr::covidcast()` has the\ncolumns required for an `epi_df` object (along with many others). We can use\n`as_epi_df()`, with specification of some relevant metadata, to bring the data\nframe into `epi_df` format.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-3_634293240d733bec84dd8b6a5c74e634'}\n\n```{.r .cell-code}\nx <- as_epi_df(cases,\n geo_type = \"state\",\n time_type = \"day\",\n as_of = max(cases$issue)\n) %>%\n select(geo_value, time_value, total_cases = value)\n\nclass(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_df\" \"tbl_df\" \"tbl\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nsummary(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` x, with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2023-03-10\n#> ----------\n#> * min time value = 2020-03-01\n#> * max time value = 2022-01-31\n#> * average rows per time value = 4\n```\n:::\n\n```{.r .cell-code}\nhead(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 6 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2023-03-10\n#> \n#> # A tibble: 6 × 3\n#> geo_value time_value total_cases\n#> * \n#> 1 ca 2020-03-01 19\n#> 2 fl 2020-03-01 0\n#> 3 ny 2020-03-01 0\n#> 4 tx 2020-03-01 0\n#> 5 ca 2020-03-02 23\n#> 6 fl 2020-03-02 1\n```\n:::\n\n```{.r .cell-code}\nattributes(x)$metadata\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2023-03-10\"\n```\n:::\n:::\n\n\n## Some details on metadata\n\nIn general, an `epi_df` object has the following fields in its metadata:\n \n* `geo_type`: the type for the geo values.\n* `time_type`: the type for the time values.\n* `as_of`: the time value at which the given data were available.\n\nMetadata for an `epi_df` object `x` can be accessed (and altered) via\n`attributes(x)$metadata`. 
The first two fields here, `geo_type` and `time_type`,\nare not currently used by any downstream functions in the `epiprocess` package,\nand serve only as useful bits of information to convey about the data set at\nhand. The last field here, `as_of`, is one of the most unique aspects of an\n`epi_df` object.\n\nIn brief, we can think of an `epi_df` object as a single snapshot of a data set\nthat contains the most up-to-date values of some signals of interest, as of the\ntime specified `as_of`. For example, if `as_of` is January 31, 2022, then the\n`epi_df` object has the most up-to-date version of the data available as of\nJanuary 31, 2022. The `epiprocess` package also provides a companion data\nstructure called `epi_archive`, which stores the full version history of a given\ndata set. See the [archive\nvignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for\nmore.\n\nIf any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a \ncall to `as_epi_df()`, then this function will try to infer them from the passed\nobject. Usually, `geo_type` and `time_type` can be inferred from the `geo_value`\nand `time_value` columns, respectively, but inferring the `as_of` field is not \nas easy. 
See the documentation for `as_epi_df()` more details.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-4_1c364218e936aa6527bd0675ab37d455'}\n\n```{.r .cell-code}\nx <- as_epi_df(cases) %>%\n select(geo_value, time_value, total_cases = value)\n\nattributes(x)$metadata\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2023-03-10\"\n```\n:::\n:::\n\n\n## Using additional key columns in `epi_df` {#sec-additional-keys}\n\nIn the following examples we will show how to create an `epi_df` with additional keys.\n\n### Converting a `tsibble` that has county code as an extra key\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-5_28361d3ac565b78677e217c86faf03cc'}\n\n```{.r .cell-code}\nset.seed(12345)\nex1 <- tibble(\n geo_value = rep(c(\"ca\", \"fl\", \"pa\"), each = 3),\n county_code = c(\n \"06059\", \"06061\", \"06067\", \"12111\", \"12113\", \"12117\",\n \"42101\", \"42103\", \"42105\"\n ),\n time_value = rep(\n seq(as.Date(\"2020-06-01\"), as.Date(\"2020-06-03\"), by = \"1 day\"),\n length.out = 9\n ),\n value = rpois(9, 5)\n) %>%\n as_tsibble(index = time_value, key = c(geo_value, county_code))\n\nex1 <- as_epi_df(x = ex1, geo_type = \"state\", time_type = \"day\", as_of = \"2020-06-03\")\n```\n:::\n\n\nThe metadata now includes `county_code` as an extra key.\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-6_1c760ce7c25a1f6867568618118bb7ac'}\n\n```{.r .cell-code}\nattr(ex1, \"metadata\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2020-06-03\"\n#> \n#> $other_keys\n#> [1] \"county_code\"\n```\n:::\n:::\n\n\n\n### Dealing with misspecified column names \n\n`epi_df` requires there to be columns `geo_value` and `time_value`, if they do not exist then `as_epi_df()` throws an error.\n\n\n::: 
{.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-7_52307fa1e07fa21173de3e9416897483'}\n\n```{.r .cell-code}\nex2 <- data.frame(\n state = rep(c(\"ca\", \"fl\", \"pa\"), each = 3), # misnamed\n pol = rep(c(\"blue\", \"swing\", \"swing\"), each = 3), # extra key\n reported_date = rep(\n seq(as.Date(\"2020-06-01\"), as.Date(\"2020-06-03\"), by = \"day\"),\n length.out = 9\n ), # misnamed\n value = rpois(9, 5)\n)\nex2 %>% as_epi_df()\n```\n\n::: {.cell-output .cell-output-error}\n```\n#> Error in `Abort()`:\n#> ! `x` must contain a `geo_value` column.\n```\n:::\n:::\n\n\nThe columns should be renamed to match `epi_df` format. \n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-8_eea2403289899a6533606cf4f555d400'}\n\n```{.r .cell-code}\nex2 <- ex2 %>%\n rename(geo_value = state, time_value = reported_date) %>%\n as_epi_df(\n geo_type = \"state\",\n as_of = \"2020-06-03\",\n additional_metadata = list(other_keys = \"pol\")\n )\n\nattr(ex2, \"metadata\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2020-06-03\"\n#> \n#> $other_keys\n#> [1] \"pol\"\n```\n:::\n:::\n\n\n\n### Adding additional keys to an `epi_df` object\n\nIn the above examples, all the keys are added to objects prior to conversion to\n`epi_df` objects. 
But this can also be accomplished afterward.\nWe'll look at an included dataset and filter to a single state for simplicity.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-9_fc0625e5160d2a01eb47d18c346874ed'}\n\n```{.r .cell-code}\nex3 <- jhu_csse_county_level_subset %>%\n filter(time_value > \"2021-12-01\", state_name == \"Massachusetts\") %>%\n slice_tail(n = 6)\n\nattr(ex3, \"metadata\") # geo_type is county currently\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"county\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2022-05-23 14:35:45 PDT\"\n```\n:::\n:::\n\n\nNow we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. The \"state\" `geo_type` anticipates lower-case abbreviations, so we'll match that. \n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-10_fe2c6e15016b44b9220d5fc4f6b51049'}\n\n```{.r .cell-code}\nex3 <- ex3 %>%\n as_tibble() %>% # drop the `epi_df` class before adding additional metadata\n mutate(\n state = rep(tolower(\"MA\"), 6),\n pol = rep(c(\"blue\", \"swing\", \"swing\"), each = 2)\n ) %>%\n as_epi_df(additional_metadata = list(other_keys = c(\"state\", \"pol\")))\n\nattr(ex3, \"metadata\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"county\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2023-06-19 20:29:48 PDT\"\n#> \n#> $other_keys\n#> [1] \"state\" \"pol\"\n```\n:::\n:::\n\n\nNote that the two additional keys we added, `state` and `pol`, are specified as a character vector in the `other_keys` component of the `additional_metadata` list. 
They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys.\n\n\n\n## Working with `epi_df` objects downstream\n\nData in `epi_df` format should be easy to work with downstream, since it is a\nvery standard tabular data format; in the other vignettes, we'll walk through\nsome basic signal processing tasks using functions provided in the `epiprocess`\npackage. Of course, we can also write custom code for other downstream uses,\nlike plotting, which is pretty easy to do `ggplot2`.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-11_cf02eb699138d3d8365b66804d295fde'}\n\n```{.r .cell-code}\nggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) +\n geom_line() +\n scale_color_brewer(palette = \"Set1\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Cumulative COVID-19 cases\", color = \"State\")\n```\n\n::: {.cell-output-display}\n![](epidf_files/figure-html/unnamed-chunk-11-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nFinally, we'll examine some data from other packages just to show how \nwe might get them into `epi_df` format. \nThe first is data on daily new (not cumulative) SARS \ncases in Canada in 2003, from the \n[outbreaks](https://github.com/reconverse/outbreaks) package. 
New cases are\nbroken into a few categories by provenance.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-12_f4dc254695766edbb2625b67c42932b7'}\n\n```{.r .cell-code}\nx <- outbreaks::sars_canada_2003 %>%\n mutate(geo_value = \"ca\") %>%\n select(geo_value, time_value = date, starts_with(\"cases\")) %>%\n as_epi_df(geo_type = \"nation\")\n\nhead(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 6 x 6 with metadata:\n#> * geo_type = nation\n#> * time_type = day\n#> * as_of = 2023-06-19 20:29:48.463959\n#> \n#> # A tibble: 6 × 6\n#> geo_value time_value cases_travel cases_household cases_healthcare\n#> * \n#> 1 ca 2003-02-23 1 0 0\n#> 2 ca 2003-02-24 0 0 0\n#> 3 ca 2003-02-25 0 0 0\n#> 4 ca 2003-02-26 0 1 0\n#> 5 ca 2003-02-27 0 0 0\n#> 6 ca 2003-02-28 1 0 0\n#> # ℹ 1 more variable: cases_other \n```\n:::\n:::\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-13_af68bf6df70c76b27435c6f2822266e9'}\n\n```{.r .cell-code code-fold=\"true\"}\nx <- x %>%\n pivot_longer(starts_with(\"cases\"), names_to = \"type\") %>%\n mutate(type = substring(type, 7))\n\nggplot(x, aes(x = time_value, y = value)) +\n geom_col(aes(fill = type), just = 0.5) +\n scale_y_continuous(breaks = 0:4 * 2, expand = expansion(c(0, 0.05))) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"SARS cases in Canada\", fill = \"Type\")\n```\n\n::: {.cell-output-display}\n![](epidf_files/figure-html/unnamed-chunk-13-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nThis next example examines data on new cases of Ebola in Sierra Leone in 2014 (from the same package).\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-14_09c7102254a1a233a78be842fcaf2096'}\n\n```{.r .cell-code}\nx <- outbreaks::ebola_sierraleone_2014 %>%\n mutate(\n cases = ifelse(status == \"confirmed\", 1, 0),\n province = case_when(\n district %in% c(\"Kailahun\", \"Kenema\", \"Kono\") ~ 
\"Eastern\",\n district %in% c(\n \"Bombali\", \"Kambia\", \"Koinadugu\",\n \"Port Loko\", \"Tonkolili\"\n ) ~ \"Northern\",\n district %in% c(\"Bo\", \"Bonthe\", \"Moyamba\", \"Pujehun\") ~ \"Sourthern\",\n district %in% c(\"Western Rural\", \"Western Urban\") ~ \"Western\"\n )\n ) %>%\n select(geo_value = province, time_value = date_of_onset, cases) %>%\n filter(cases == 1) %>%\n group_by(geo_value, time_value) %>%\n summarise(cases = sum(cases)) %>%\n as_epi_df(geo_type = \"province\")\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-15_7b787f995e155e919b8f184101e75f87'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(x, aes(x = time_value, y = cases)) +\n geom_col(aes(fill = geo_value), show.legend = FALSE) +\n facet_wrap(~geo_value, scales = \"free_y\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Confirmed cases of Ebola in Sierra Leone\")\n```\n\n::: {.cell-output-display}\n![](epidf_files/figure-html/unnamed-chunk-15-1.svg){fig-align='center' width=90%}\n:::\n:::\n", + "markdown": "# Getting data into epi_df format\n\n\n\n\n\nWe'll start by showing how to get data into \n`epi_df`, which is just\na tibble with a bit of special structure, and is the format assumed by all of\nthe functions in the `epiprocess` package. An `epi_df` object has (at least) the\nfollowing columns:\n\n* `geo_value`: the geographic value associated with each row of measurements.\n* `time_value`: the time value associated with each row of measurements.\n\nIt can have any number of other columns which can serve as measured variables,\nwhich we also broadly refer to as signal variables. The documentation for\n gives more details about this data format.\n\nA data frame or tibble that has `geo_value` and `time_value` columns can be\nconverted into an `epi_df` object, using the function `as_epi_df()`. As an\nexample, we'll work with daily cumulative COVID-19 cases from four U.S. 
states:\nCA, FL, NY, and TX, over time span from mid 2020 to early 2022, and we'll use\nthe [`epidatr`](https://github.com/cmu-delphi/epidatr) package\nto fetch this data from the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html).\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-2_a8b0ce831d237748edcef31c420862a2'}\n\n```{.r .cell-code}\nlibrary(epidatr)\nlibrary(epiprocess)\nlibrary(withr)\n\ncases <- pub_covidcast(\n source = \"jhu-csse\",\n signals = \"confirmed_cumulative_num\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20200301, 20220131),\n geo_values = \"ca,fl,ny,tx\"\n)\n\ncolnames(cases)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"geo_value\" \"signal\" \"source\" \n#> [4] \"geo_type\" \"time_type\" \"time_value\" \n#> [7] \"direction\" \"issue\" \"lag\" \n#> [10] \"missing_value\" \"missing_stderr\" \"missing_sample_size\"\n#> [13] \"value\" \"stderr\" \"sample_size\"\n```\n:::\n:::\n\n\nAs we can see, a data frame returned by `epidatr::covidcast()` has the\ncolumns required for an `epi_df` object (along with many others). 
We can use\n`as_epi_df()`, with specification of some relevant metadata, to bring the data\nframe into `epi_df` format.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-3_634293240d733bec84dd8b6a5c74e634'}\n\n```{.r .cell-code}\nx <- as_epi_df(cases,\n geo_type = \"state\",\n time_type = \"day\",\n as_of = max(cases$issue)\n) %>%\n select(geo_value, time_value, total_cases = value)\n\nclass(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_df\" \"tbl_df\" \"tbl\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nsummary(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` x, with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2023-03-10\n#> ----------\n#> * min time value = 2020-03-01\n#> * max time value = 2022-01-31\n#> * average rows per time value = 4\n```\n:::\n\n```{.r .cell-code}\nhead(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 6 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2023-03-10\n#> \n#> # A tibble: 6 × 3\n#> geo_value time_value total_cases\n#> * \n#> 1 ca 2020-03-01 19\n#> 2 fl 2020-03-01 0\n#> 3 ny 2020-03-01 0\n#> 4 tx 2020-03-01 0\n#> 5 ca 2020-03-02 23\n#> 6 fl 2020-03-02 1\n```\n:::\n\n```{.r .cell-code}\nattributes(x)$metadata\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2023-03-10\"\n```\n:::\n:::\n\n\n## Some details on metadata\n\nIn general, an `epi_df` object has the following fields in its metadata:\n \n* `geo_type`: the type for the geo values.\n* `time_type`: the type for the time values.\n* `as_of`: the time value at which the given data were available.\n\nMetadata for an `epi_df` object `x` can be accessed (and altered) via\n`attributes(x)$metadata`. 
The first two fields here, `geo_type` and `time_type`,\nare not currently used by any downstream functions in the `epiprocess` package,\nand serve only as useful bits of information to convey about the data set at\nhand. The last field here, `as_of`, is one of the most unique aspects of an\n`epi_df` object.\n\nIn brief, we can think of an `epi_df` object as a single snapshot of a data set\nthat contains the most up-to-date values of some signals of interest, as of the\ntime specified `as_of`. For example, if `as_of` is January 31, 2022, then the\n`epi_df` object has the most up-to-date version of the data available as of\nJanuary 31, 2022. The `epiprocess` package also provides a companion data\nstructure called `epi_archive`, which stores the full version history of a given\ndata set. See the [archive\nvignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for\nmore.\n\nIf any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a \ncall to `as_epi_df()`, then this function will try to infer them from the passed\nobject. Usually, `geo_type` and `time_type` can be inferred from the `geo_value`\nand `time_value` columns, respectively, but inferring the `as_of` field is not \nas easy. 
See the documentation for `as_epi_df()` more details.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-4_1c364218e936aa6527bd0675ab37d455'}\n\n```{.r .cell-code}\nx <- as_epi_df(cases) %>%\n select(geo_value, time_value, total_cases = value)\n\nattributes(x)$metadata\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2023-03-10\"\n```\n:::\n:::\n\n\n## Using additional key columns in `epi_df` {#sec-additional-keys}\n\nIn the following examples we will show how to create an `epi_df` with additional keys.\n\n### Converting a `tsibble` that has county code as an extra key\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-5_28361d3ac565b78677e217c86faf03cc'}\n\n```{.r .cell-code}\nset.seed(12345)\nex1 <- tibble(\n geo_value = rep(c(\"ca\", \"fl\", \"pa\"), each = 3),\n county_code = c(\n \"06059\", \"06061\", \"06067\", \"12111\", \"12113\", \"12117\",\n \"42101\", \"42103\", \"42105\"\n ),\n time_value = rep(\n seq(as.Date(\"2020-06-01\"), as.Date(\"2020-06-03\"), by = \"1 day\"),\n length.out = 9\n ),\n value = rpois(9, 5)\n) %>%\n as_tsibble(index = time_value, key = c(geo_value, county_code))\n\nex1 <- as_epi_df(x = ex1, geo_type = \"state\", time_type = \"day\", as_of = \"2020-06-03\")\n```\n:::\n\n\nThe metadata now includes `county_code` as an extra key.\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-6_1c760ce7c25a1f6867568618118bb7ac'}\n\n```{.r .cell-code}\nattr(ex1, \"metadata\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2020-06-03\"\n#> \n#> $other_keys\n#> [1] \"county_code\"\n```\n:::\n:::\n\n\n\n### Dealing with misspecified column names \n\n`epi_df` requires there to be columns `geo_value` and `time_value`, if they do not exist then `as_epi_df()` throws an error.\n\n\n::: 
{.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-7_52307fa1e07fa21173de3e9416897483'}\n\n```{.r .cell-code}\nex2 <- data.frame(\n state = rep(c(\"ca\", \"fl\", \"pa\"), each = 3), # misnamed\n pol = rep(c(\"blue\", \"swing\", \"swing\"), each = 3), # extra key\n reported_date = rep(\n seq(as.Date(\"2020-06-01\"), as.Date(\"2020-06-03\"), by = \"day\"),\n length.out = 9\n ), # misnamed\n value = rpois(9, 5)\n)\nex2 %>% as_epi_df()\n```\n\n::: {.cell-output .cell-output-error}\n```\n#> Error in `Abort()`:\n#> ! `x` must contain a `geo_value` column.\n```\n:::\n:::\n\n\nThe columns should be renamed to match `epi_df` format. \n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-8_eea2403289899a6533606cf4f555d400'}\n\n```{.r .cell-code}\nex2 <- ex2 %>%\n rename(geo_value = state, time_value = reported_date) %>%\n as_epi_df(\n geo_type = \"state\",\n as_of = \"2020-06-03\",\n additional_metadata = list(other_keys = \"pol\")\n )\n\nattr(ex2, \"metadata\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2020-06-03\"\n#> \n#> $other_keys\n#> [1] \"pol\"\n```\n:::\n:::\n\n\n\n### Adding additional keys to an `epi_df` object\n\nIn the above examples, all the keys are added to objects prior to conversion to\n`epi_df` objects. 
But this can also be accomplished afterward.\nWe'll look at an included dataset and filter to a single state for simplicity.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-9_fc0625e5160d2a01eb47d18c346874ed'}\n\n```{.r .cell-code}\nex3 <- jhu_csse_county_level_subset %>%\n filter(time_value > \"2021-12-01\", state_name == \"Massachusetts\") %>%\n slice_tail(n = 6)\n\nattr(ex3, \"metadata\") # geo_type is county currently\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"county\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2022-05-23 14:35:45 PDT\"\n```\n:::\n:::\n\n\nNow we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. The \"state\" `geo_type` anticipates lower-case abbreviations, so we'll match that. \n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-10_fe2c6e15016b44b9220d5fc4f6b51049'}\n\n```{.r .cell-code}\nex3 <- ex3 %>%\n as_tibble() %>% # drop the `epi_df` class before adding additional metadata\n mutate(\n state = rep(tolower(\"MA\"), 6),\n pol = rep(c(\"blue\", \"swing\", \"swing\"), each = 2)\n ) %>%\n as_epi_df(additional_metadata = list(other_keys = c(\"state\", \"pol\")))\n\nattr(ex3, \"metadata\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"county\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2023-12-15 04:50:49 PST\"\n#> \n#> $other_keys\n#> [1] \"state\" \"pol\"\n```\n:::\n:::\n\n\nNote that the two additional keys we added, `state` and `pol`, are specified as a character vector in the `other_keys` component of the `additional_metadata` list. 
They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys.\n\n\n\n## Working with `epi_df` objects downstream\n\nData in `epi_df` format should be easy to work with downstream, since it is a\nvery standard tabular data format; in the other vignettes, we'll walk through\nsome basic signal processing tasks using functions provided in the `epiprocess`\npackage. Of course, we can also write custom code for other downstream uses,\nlike plotting, which is pretty easy to do `ggplot2`.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-11_cf02eb699138d3d8365b66804d295fde'}\n\n```{.r .cell-code}\nggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) +\n geom_line() +\n scale_color_brewer(palette = \"Set1\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Cumulative COVID-19 cases\", color = \"State\")\n```\n\n::: {.cell-output-display}\n![](epidf_files/figure-html/unnamed-chunk-11-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nFinally, we'll examine some data from other packages just to show how \nwe might get them into `epi_df` format. \nThe first is data on daily new (not cumulative) SARS \ncases in Canada in 2003, from the \n[outbreaks](https://github.com/reconverse/outbreaks) package. 
New cases are\nbroken into a few categories by provenance.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-12_f4dc254695766edbb2625b67c42932b7'}\n\n```{.r .cell-code}\nx <- outbreaks::sars_canada_2003 %>%\n mutate(geo_value = \"ca\") %>%\n select(geo_value, time_value = date, starts_with(\"cases\")) %>%\n as_epi_df(geo_type = \"nation\")\n\nhead(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 6 x 6 with metadata:\n#> * geo_type = nation\n#> * time_type = day\n#> * as_of = 2023-12-15 04:50:50\n#> \n#> # A tibble: 6 × 6\n#> geo_value time_value cases_travel cases_household cases_healthcare\n#> * \n#> 1 ca 2003-02-23 1 0 0\n#> 2 ca 2003-02-24 0 0 0\n#> 3 ca 2003-02-25 0 0 0\n#> 4 ca 2003-02-26 0 1 0\n#> 5 ca 2003-02-27 0 0 0\n#> 6 ca 2003-02-28 1 0 0\n#> # ℹ 1 more variable: cases_other \n```\n:::\n:::\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-13_af68bf6df70c76b27435c6f2822266e9'}\n\n```{.r .cell-code code-fold=\"true\"}\nx <- x %>%\n pivot_longer(starts_with(\"cases\"), names_to = \"type\") %>%\n mutate(type = substring(type, 7))\n\nggplot(x, aes(x = time_value, y = value)) +\n geom_col(aes(fill = type), just = 0.5) +\n scale_y_continuous(breaks = 0:4 * 2, expand = expansion(c(0, 0.05))) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"SARS cases in Canada\", fill = \"Type\")\n```\n\n::: {.cell-output-display}\n![](epidf_files/figure-html/unnamed-chunk-13-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nThis next example examines data on new cases of Ebola in Sierra Leone in 2014 (from the same package).\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-14_09c7102254a1a233a78be842fcaf2096'}\n\n```{.r .cell-code}\nx <- outbreaks::ebola_sierraleone_2014 %>%\n mutate(\n cases = ifelse(status == \"confirmed\", 1, 0),\n province = case_when(\n district %in% c(\"Kailahun\", \"Kenema\", \"Kono\") ~ 
\"Eastern\",\n district %in% c(\n \"Bombali\", \"Kambia\", \"Koinadugu\",\n \"Port Loko\", \"Tonkolili\"\n ) ~ \"Northern\",\n district %in% c(\"Bo\", \"Bonthe\", \"Moyamba\", \"Pujehun\") ~ \"Sourthern\",\n district %in% c(\"Western Rural\", \"Western Urban\") ~ \"Western\"\n )\n ) %>%\n select(geo_value = province, time_value = date_of_onset, cases) %>%\n filter(cases == 1) %>%\n group_by(geo_value, time_value) %>%\n summarise(cases = sum(cases)) %>%\n as_epi_df(geo_type = \"province\")\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-15_7b787f995e155e919b8f184101e75f87'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(x, aes(x = time_value, y = cases)) +\n geom_col(aes(fill = geo_value), show.legend = FALSE) +\n facet_wrap(~geo_value, scales = \"free_y\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Confirmed cases of Ebola in Sierra Leone\")\n```\n\n::: {.cell-output-display}\n![](epidf_files/figure-html/unnamed-chunk-15-1.svg){fig-align='center' width=90%}\n:::\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/epipredict/execute-results/html.json b/_freeze/epipredict/execute-results/html.json index e6a72f4..7afd049 100644 --- a/_freeze/epipredict/execute-results/html.json +++ b/_freeze/epipredict/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "9c99451eb5966f60d81e919fe85842d1", + "hash": "82bf7f7f30b318d1b412c6d9742c4a64", "result": { - "markdown": "# Overview\n\n\n\n\n\nAt a high level, our goal with `{epipredict}` is to make running simple machine learning / statistical forecasters for epidemiology easy. However, this package is extremely extensible, and that is part of its utility. 
Our hope is that it is easy for users with epidemiology training and some statistics to fit baseline models while still allowing those with more nuanced statistical understanding to create complicated specializations using the same framework.\n\nServing both populations is the main motivation for our efforts, but at the same time, we have tried hard to make it useful.\n\n\n## Baseline models\n\nWe provide a set of basic, easy-to-use forecasters that work out of the box. \nYou should be able to do a reasonably limited amount of customization on them. Any serious customization happens with the framework discussed below).\n\nFor the basic forecasters, we provide: \n \n* Flatline (basic) forecaster \n* Autoregressive forecaster\n* Autoregressive classifier\n* Smooth AR forecaster\n\nAll the forcasters we provide are built on our framework. So we will use these basic models to illustrate its flexibility.\n\n## Forecasting framework\n\nAt its core, `{epipredict}` is a **framework** for creating custom forecasters.\nBy that we mean that we view the process of creating custom forecasters as\na collection of modular components. All of them should be easy to swap out\nor exchange for others, and massive variety should be available by fairly \nsimple modifications through the addition of steps or layers. \nThere are four types of components:\n \n1. Preprocessor: make transformations to the data before model training\n2. Trainer: train a model on data, resulting in a fitted model object\n3. Predictor: make predictions, using a fitted model object and processed test data\n4. Postprocessor: manipulate or transform the predictions before returning\n \nUsers familiar with [`{tidymodels}`](https://www.tidymodels.org) and especially \nthe [`{workflows}`](https://workflows.tidymodels.org) package will notice a lot \nof overlap. This is by design, and is in fact a feature. 
The truth is that\n`{epipredict}` is a wrapper around much that is contained in these packages.\nTherefore, if you want something from this -verse, it should \"just work\" (we hope).\n\nThe reason for the overlap is that `{workflows}` _already implements_ the first \nthree steps. And it does this very well. However, it is missing the \npostprocessing stage and currently has no plans for such an implementation. \nAnd this feature is important. The baseline forecaster we provide _requires_\npostprocessing. Anything more complicated (which is nearly everything) \nneeds this as well.\n\nThe second omission from `{tidymodels}` is support for panel data. Besides\nepidemiological data, economics, psychology, sociology, and many other areas\nfrequently deal with data of this type. So the framework of behind `{epipredict}`\nimplements this. In principle, this has nothing to do with epidemiology, and \none could simply use this package as a solution for the missing functionality in\n`{tidymodels}`. Again, this should \"just work\" (we hope).\n\nAll of the _panel data_ functionality is implemented through the `epi_df` data type\ndescribed in the previous part. If you have different panel data, just force it\ninto an `epi_df` as described in @sec-additional-keys.\n\n## Why doesn't this package already exist?\n\n- Parts of it actually DO exist. There's a universe called `tidymodels`. It \nhandles pre-processing, training, and prediction, bound together, through a \npackage called workflows. We built `epipredict` on top of that setup. In this \nway, you CAN use almost everything they provide.\n- However, workflows doesn't do post-processing to the extent envisioned here.\nAnd nothing in `tidymodels` handles panel data.\n- The tidy-team doesn't have plans to do either of these things. 
(We checked).\n- There are two packages that do time series built on `tidymodels`, but it's \n\"basic\" time series: 1-step AR models, exponential smoothing, STL decomposition,\netc.[^1] \n\n[^1]: Our group has not prioritized these sorts of models for epidemic \nforecasting, but one could also integrate these methods into our framework.\n\n\n## Show me the basics\n\nFor now, we'll just demonstrate one of the \"canned\" forecasters we provide: an autoregressive forecaster with (or without) covariates that _directly_ trains on the response. This is in contrast to a typical \"iterative\" AR model that trains to predict one-step-ahead, and then plugs in the predictions to \"leverage up\" to longer horizons. You saw this function in @sec-local-forecaster, but now we'll explain\nthe arguments a bit more thoroughly. Below, you'll see how to make a number of modifications to this\nforecaster, but understanding the inner workings, and **why** you would want\nsomething like this (as well as how to do elaborate customizations) \nwill be the subject of the rest of this book. \n\nWe'll use some of the same data we've examined earlier and estimate a model jointly across all locations using only the most recent 30 days of data (available\nin the built-in data frame).\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/demo-workflow_c1aed8396eb1b411c4ab154ecd15e222'}\n\n```{.r .cell-code}\njhu <- case_death_rate_subset %>%\n filter(time_value >= max(time_value) - 30)\n\nout <- arx_forecaster(\n jhu,\n outcome = \"death_rate\",\n predictors = c(\"case_rate\", \"death_rate\")\n)\n```\n\n::: {.cell-output .cell-output-stderr}\n```\n#> Warning: The forecast_date is less than the most recent update date of the\n#> data.forecast_date = 2021-12-31 while data is from 2022-05-31.\n```\n:::\n:::\n\n\nThis call produces a warning, which we'll ignore for now. But essentially, it's telling us that our data comes from May 2022 but we're trying to do a forecast for January 2022. 
The result is likely not an accurate measure of real-time forecast performance, because the data have been revised over time. \n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/unnamed-chunk-2_7b68ec6f4741ca9ebb25c7a13be54061'}\n\n```{.r .cell-code}\nout\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> ══ A basic forecaster of type ARX Forecaster ════════════════════════════════\n#> \n#> This forecaster was fit on 2023-06-19 20:31:07\n#> \n#> Training data was an `epi_df` with\n#> • Geography: state,\n#> • Time type: day,\n#> • Using data up-to-date as of: 2022-05-31 12:08:25.\n#> \n#> ── Predictions ──────────────────────────────────────────────────────────────\n#> \n#> A total of 56 predictions are available for\n#> • 56 unique geographic regions,\n#> • At forecast dates: 2021-12-31,\n#> • For target dates: 2022-01-07.\n```\n:::\n:::\n\n\nPrinting the S3 object provides a bunch of summary information describing the \noriginal training data used to estimate the model as well as some information\nof what the predictions are for. It contains three main components:\n \n1. Metadata about the training data and when the forecast was created\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/unnamed-chunk-3_2d86a3bbe62cc3a5e7b6c3e02059257d'}\n\n```{.r .cell-code}\nstr(out$metadata)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> List of 2\n#> $ training :List of 3\n#> ..$ geo_type : chr \"state\"\n#> ..$ time_type: chr \"day\"\n#> ..$ as_of : POSIXct[1:1], format: \"2022-05-31 12:08:25\"\n#> $ forecast_created: POSIXct[1:1], format: \"2023-06-19 20:31:07\"\n```\n:::\n:::\n\n2. The predictions in a tibble. The columns give the predictions for each location along with additional columns. 
By default, these are a 90% predictive interval, the `forecast_date` (the date on which the forecast was putatively made) and the `target_date` (the date for which the forecast is being made).\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/unnamed-chunk-4_75eac823f860d76a7dd42bdea9e94ee1'}\n\n```{.r .cell-code}\nout$predictions\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 56 × 5\n#> geo_value .pred .pred_distn forecast_date target_date\n#> \n#> 1 ak 0.355 [0.05, 0.95] 2021-12-31 2022-01-07 \n#> 2 al 0.325 [0.05, 0.95] 2021-12-31 2022-01-07 \n#> 3 ar 0.496 [0.05, 0.95] 2021-12-31 2022-01-07 \n#> 4 as 0.0836 [0.05, 0.95] 2021-12-31 2022-01-07 \n#> 5 az 0.614 [0.05, 0.95] 2021-12-31 2022-01-07 \n#> 6 ca 0.327 [0.05, 0.95] 2021-12-31 2022-01-07 \n#> # ℹ 50 more rows\n```\n:::\n:::\n\n3. An S3 object of class `epi_workflow`. This object encapsulates all the instructions necessary to create the prediction. More details on this below.\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/unnamed-chunk-5_f9e3ce37b5ca3e9f59bd1977f249b01f'}\n\n```{.r .cell-code}\nout$epi_workflow\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Epi Workflow [trained] ═══════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> Postprocessor: Frosting\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 6 Recipe Steps\n#> \n#> • step_epi_lag()\n#> • step_epi_lag()\n#> • step_epi_ahead()\n#> • step_naomit()\n#> • step_naomit()\n#> • step_training_window()\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> \n#> Call:\n#> stats::lm(formula = ..y ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) lag_0_case_rate lag_7_case_rate lag_14_case_rate \n#> 0.0829475 0.0009830 0.0027035 -0.0005651 \n#> lag_0_death_rate lag_7_death_rate lag_14_death_rate \n#> 0.2466110 0.1964921 0.0752998 \n#> \n#> ── Postprocessor 
────────────────────────────────────────────────────────────\n#> 5 Frosting Layers\n#> \n#> • layer_predict()\n#> • layer_residual_quantiles()\n#> • layer_add_forecast_date()\n#> • layer_add_target_date()\n#> • layer_threshold()\n```\n:::\n:::\n\n\nBy default, the forecaster predicts the outcome (`death_rate`) 1-week ahead, \nusing 3 lags of each predictor (`case_rate` and `death_rate`) at 0 (today), \n1 week back and 2 weeks back. The predictors and outcome can be changed \ndirectly. The rest of the defaults are encapsulated into a list of arguments. \nThis list is produced by `arx_args_list()`.\n\n## Simple adjustments\n\nBasic adjustments can be made through the `args_list`.\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/differential-lags_6b8588f2553c80d191c85d09836876f9'}\n\n```{.r .cell-code}\nout2week <- arx_forecaster(\n epi_data = jhu,\n outcome = \"death_rate\",\n predictors = c(\"case_rate\", \"death_rate\"),\n args_list = arx_args_list(\n lags = list(case_rate = c(0, 1, 2, 3, 7, 14), death_rate = c(0, 7, 14)),\n ahead = 14\n )\n)\n```\n:::\n\n\nHere, we've used different lags on the `case_rate` and are now predicting 2 \nweeks ahead. Note that `lags` and `aheads` are in the same units as the \n`time_value` of the `epi_df` used for training (same as the `epi_slide()` \narguments discussed in @sec-sliding). This example also illustrates\na major difficulty with the \"iterative\" versions of AR models. This model \ndoesn't produce forecasts for `case_rate`, and so, would not have data to \n\"plug in\" for the necessary lags.[^2]\n\n[^2]: An obvious fix is to instead use a VAR and predict both, but this would \nlikely increase the variance of the model, and therefore, may lead to less \naccurate forecasts for the variable of interest.\n\n\nAnother property of the basic model is the predictive interval. 
We describe this in more detail in a coming chapter, but it is easy to request multiple quantiles.\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/differential-levels_a9da683d7e7fef5e4cb6288ad9899809'}\n\n```{.r .cell-code}\nout_q <- arx_forecaster(jhu, \"death_rate\", c(\"case_rate\", \"death_rate\"),\n args_list = arx_args_list(\n levels = c(.01, .025, seq(.05, .95, by = .05), .975, .99)\n )\n)\n```\n:::\n\n\nThe column `.pred_dstn` in the `predictions` object is actually a \"distribution\" here parameterized by its quantiles. For this default forecaster, these are created using the quantiles of the residuals of the predictive model (possibly symmetrized). Here, we used 23 quantiles, but one can grab a particular quantile,\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/q1_5df8261d92421ead6dd2d77e0a127517'}\n\n```{.r .cell-code}\nhead(quantile(out_q$predictions$.pred_distn, p = .4))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> 40% 40% 40% 40% 40% 40% \n#> 0.30277798 0.27213225 0.44345734 0.03120647 0.56121844 0.27492711\n```\n:::\n:::\n\n\nor extract the entire distribution into a \"long\" `epi_df` with `tau` being the probability and `q` being the value associated to that quantile.\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/q2_439cb3bc49eb03d8b4c34070ac5ba21d'}\n\n```{.r .cell-code}\nout_q$predictions %>%\n # first create a \"nested\" list-column\n mutate(.pred_distn = nested_quantiles(.pred_distn)) %>%\n unnest(.pred_distn) # then unnest it\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 1,288 × 6\n#> geo_value .pred q tau forecast_date target_date\n#> \n#> 1 ak 0.355 0 0.01 2021-12-31 2022-01-07 \n#> 2 ak 0.355 0 0.025 2021-12-31 2022-01-07 \n#> 3 ak 0.355 0.0371 0.05 2021-12-31 2022-01-07 \n#> 4 ak 0.355 0.123 0.1 2021-12-31 2022-01-07 \n#> 5 ak 0.355 0.174 0.15 2021-12-31 2022-01-07 \n#> 6 ak 0.355 0.211 0.2 2021-12-31 2022-01-07 \n#> # ℹ 1,282 more 
rows\n```\n:::\n:::\n\n\nAdditional simple adjustments to the basic forecaster can be made using the function:\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/unnamed-chunk-6_07ecdd97f1e2f61c667029fbe5d0d406'}\n\n```{.r .cell-code}\narx_args_list(\n lags = c(0L, 7L, 14L), ahead = 7L, n_training = Inf,\n forecast_date = NULL, target_date = NULL, levels = c(0.05, 0.95),\n symmetrize = TRUE, nonneg = TRUE, quantile_by_key = \"geo_value\"\n)\n```\n:::\n\n\n## Changing the engine\n\nSo far, our forecasts have been produced using simple linear regression. But this is not the only way to estimate such a model.\nThe `trainer` argument determines the type of model we want. \nThis takes a [`{parsnip}`](https://parsnip.tidymodels.org) model. The default is linear regression, but we could instead use a random forest with the `{ranger}` package:\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/ranger_165b974f4c4580092d3398b1d2bee018'}\n\n```{.r .cell-code}\nout_rf <- arx_forecaster(\n jhu, \"death_rate\", c(\"case_rate\", \"death_rate\"),\n rand_forest(mode = \"regression\")\n)\n```\n:::\n\n\nOr boosted regression trees with `{xgboost}`:\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/xgboost_59219c946dd5689d23feb924a64c64be'}\n\n```{.r .cell-code}\nout_gb <- arx_forecaster(\n jhu, \"death_rate\", c(\"case_rate\", \"death_rate\"),\n boost_tree(mode = \"regression\", trees = 20)\n)\n```\n:::\n\n\nOr quantile regression, using our custom forecasting engine `quantile_reg()`:\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/quantreg_4e2d216ee037905d2417be2d184f5664'}\n\n```{.r .cell-code}\nout_gb <- arx_forecaster(\n jhu, \"death_rate\", c(\"case_rate\", \"death_rate\"),\n quantile_reg()\n)\n```\n:::\n\n\nFWIW, this last case (using quantile regression), is not far from what the Delphi production forecast team used for its Covid forecasts over the past few years.\n\n", + "markdown": "# 
Overview\n\n\n\n\n\nAt a high level, our goal with `{epipredict}` is to make running simple machine learning / statistical forecasters for epidemiology easy. However, this package is extremely extensible, and that is part of its utility. Our hope is that it is easy for users with epidemiology training and some statistics to fit baseline models while still allowing those with more nuanced statistical understanding to create complicated specializations using the same framework.\n\nServing both populations is the main motivation for our efforts, but at the same time, we have tried hard to make it useful.\n\n\n## Baseline models\n\nWe provide a set of basic, easy-to-use forecasters that work out of the box. \nYou should be able to do a reasonably limited amount of customization on them. Any serious customization happens with the framework discussed below.\n\nFor the basic forecasters, we provide: \n \n* Flatline (basic) forecaster \n* Autoregressive forecaster\n* Autoregressive classifier\n* Smooth AR forecaster\n\nAll the forecasters we provide are built on our framework. So we will use these basic models to illustrate its flexibility.\n\n## Forecasting framework\n\nAt its core, `{epipredict}` is a **framework** for creating custom forecasters.\nBy that we mean that we view the process of creating custom forecasters as\na collection of modular components. All of them should be easy to swap out\nor exchange for others, and massive variety should be available by fairly \nsimple modifications through the addition of steps or layers. \nThere are four types of components:\n \n1. Preprocessor: make transformations to the data before model training\n2. Trainer: train a model on data, resulting in a fitted model object\n3. Predictor: make predictions, using a fitted model object and processed test data\n4. 
Postprocessor: manipulate or transform the predictions before returning\n \nUsers familiar with [`{tidymodels}`](https://www.tidymodels.org) and especially \nthe [`{workflows}`](https://workflows.tidymodels.org) package will notice a lot \nof overlap. This is by design, and is in fact a feature. The truth is that\n`{epipredict}` is a wrapper around much that is contained in these packages.\nTherefore, if you want something from this -verse, it should \"just work\" (we hope).\n\nThe reason for the overlap is that `{workflows}` _already implements_ the first \nthree steps. And it does this very well. However, it is missing the \npostprocessing stage and currently has no plans for such an implementation. \nAnd this feature is important. The baseline forecaster we provide _requires_\npostprocessing. Anything more complicated (which is nearly everything) \nneeds this as well.\n\nThe second omission from `{tidymodels}` is support for panel data. Besides\nepidemiological data, economics, psychology, sociology, and many other areas\nfrequently deal with data of this type. So the framework behind `{epipredict}`\nimplements this. In principle, this has nothing to do with epidemiology, and \none could simply use this package as a solution for the missing functionality in\n`{tidymodels}`. Again, this should \"just work\" (we hope).\n\nAll of the _panel data_ functionality is implemented through the `epi_df` data type\ndescribed in the previous part. If you have different panel data, just force it\ninto an `epi_df` as described in @sec-additional-keys.\n\n## Why doesn't this package already exist?\n\n- Parts of it actually DO exist. There's a universe called `tidymodels`. It \nhandles pre-processing, training, and prediction, bound together, through a \npackage called workflows. We built `epipredict` on top of that setup. 
In this \nway, you CAN use almost everything they provide.\n- However, workflows doesn't do post-processing to the extent envisioned here.\nAnd nothing in `tidymodels` handles panel data.\n- The tidy-team doesn't have plans to do either of these things. (We checked).\n- There are two packages that do time series built on `tidymodels`, but it's \n\"basic\" time series: 1-step AR models, exponential smoothing, STL decomposition,\netc.[^1] \n\n[^1]: Our group has not prioritized these sorts of models for epidemic \nforecasting, but one could also integrate these methods into our framework.\n\n\n## Show me the basics\n\nFor now, we'll just demonstrate one of the \"canned\" forecasters we provide: an autoregressive forecaster with (or without) covariates that _directly_ trains on the response. This is in contrast to a typical \"iterative\" AR model that trains to predict one-step-ahead, and then plugs in the predictions to \"leverage up\" to longer horizons. You saw this function in @sec-local-forecaster, but now we'll explain\nthe arguments a bit more thoroughly. Below, you'll see how to make a number of modifications to this\nforecaster, but understanding the inner workings, and **why** you would want\nsomething like this (as well as how to do elaborate customizations) \nwill be the subject of the rest of this book. 
\n\nWe'll use some of the same data we've examined earlier and estimate a model jointly across all locations using only the most recent 30 days of data (available\nin the built-in data frame).\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/demo-workflow_c1aed8396eb1b411c4ab154ecd15e222'}\n\n```{.r .cell-code}\njhu <- case_death_rate_subset %>%\n filter(time_value >= max(time_value) - 30)\n\nout <- arx_forecaster(\n jhu,\n outcome = \"death_rate\",\n predictors = c(\"case_rate\", \"death_rate\")\n)\n```\n\n::: {.cell-output .cell-output-stderr}\n```\n#> Warning: The forecast_date is less than the most recent update date of the\n#> data: forecast_date = 2021-12-31 while data is from 2022-05-31.\n```\n:::\n:::\n\n\nThis call produces a warning, which we'll ignore for now. But essentially, it's telling us that our data comes from May 2022 but we're trying to do a forecast for January 2022. The result is likely not an accurate measure of real-time forecast performance, because the data have been revised over time. \n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/unnamed-chunk-2_7b68ec6f4741ca9ebb25c7a13be54061'}\n\n```{.r .cell-code}\nout\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> ══ A basic forecaster of type ARX Forecaster ════════════════════════════════\n#> \n#> This forecaster was fit on 2023-12-15 04:53:13\n#> \n#> Training data was an `epi_df` with\n#> • Geography: state,\n#> • Time type: day,\n#> • Using data up-to-date as of: 2022-05-31 12:08:25.\n#> \n#> ── Predictions ──────────────────────────────────────────────────────────────\n#> \n#> A total of 56 predictions are available for\n#> • 56 unique geographic regions,\n#> • At forecast dates: 2021-12-31,\n#> • For target dates: 2022-01-07.\n```\n:::\n:::\n\n\nPrinting the S3 object provides a bunch of summary information describing the \noriginal training data used to estimate the model as well as some information\nof what the predictions are for. 
It contains three main components:\n \n1. Metadata about the training data and when the forecast was created\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/unnamed-chunk-3_2d86a3bbe62cc3a5e7b6c3e02059257d'}\n\n```{.r .cell-code}\nstr(out$metadata)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> List of 2\n#> $ training :List of 3\n#> ..$ geo_type : chr \"state\"\n#> ..$ time_type: chr \"day\"\n#> ..$ as_of : POSIXct[1:1], format: \"2022-05-31 12:08:25\"\n#> $ forecast_created: POSIXct[1:1], format: \"2023-12-15 04:53:13\"\n```\n:::\n:::\n\n2. The predictions in a tibble. The columns give the predictions for each location along with additional columns. By default, these are a 90% predictive interval, the `forecast_date` (the date on which the forecast was putatively made) and the `target_date` (the date for which the forecast is being made).\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/unnamed-chunk-4_75eac823f860d76a7dd42bdea9e94ee1'}\n\n```{.r .cell-code}\nout$predictions\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 56 × 5\n#> geo_value .pred .pred_distn forecast_date target_date\n#> \n#> 1 ak 0.355 quantiles(0.36)[2] 2021-12-31 2022-01-07 \n#> 2 al 0.325 quantiles(0.32)[2] 2021-12-31 2022-01-07 \n#> 3 ar 0.496 quantiles(0.5)[2] 2021-12-31 2022-01-07 \n#> 4 as 0.0836 quantiles(0.2)[2] 2021-12-31 2022-01-07 \n#> 5 az 0.614 quantiles(0.61)[2] 2021-12-31 2022-01-07 \n#> 6 ca 0.327 quantiles(0.33)[2] 2021-12-31 2022-01-07 \n#> # ℹ 50 more rows\n```\n:::\n:::\n\n3. An S3 object of class `epi_workflow`. This object encapsulates all the instructions necessary to create the prediction. 
More details on this below.\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/unnamed-chunk-5_f9e3ce37b5ca3e9f59bd1977f249b01f'}\n\n```{.r .cell-code}\nout$epi_workflow\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Epi Workflow [trained] ═══════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> Postprocessor: Frosting\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 6 Recipe Steps\n#> \n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> \n#> Call:\n#> stats::lm(formula = ..y ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) lag_0_case_rate lag_7_case_rate lag_14_case_rate \n#> 0.0829475 0.0009830 0.0027035 -0.0005651 \n#> lag_0_death_rate lag_7_death_rate lag_14_death_rate \n#> 0.2466110 0.1964921 0.0752998 \n#> \n#> ── Postprocessor ────────────────────────────────────────────────────────────\n#> 5 Frosting Layers\n```\n:::\n:::\n\n\nBy default, the forecaster predicts the outcome (`death_rate`) 1-week ahead, \nusing 3 lags of each predictor (`case_rate` and `death_rate`) at 0 (today), \n1 week back and 2 weeks back. The predictors and outcome can be changed \ndirectly. The rest of the defaults are encapsulated into a list of arguments. \nThis list is produced by `arx_args_list()`.\n\n## Simple adjustments\n\nBasic adjustments can be made through the `args_list`.\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/differential-lags_6b8588f2553c80d191c85d09836876f9'}\n\n```{.r .cell-code}\nout2week <- arx_forecaster(\n epi_data = jhu,\n outcome = \"death_rate\",\n predictors = c(\"case_rate\", \"death_rate\"),\n args_list = arx_args_list(\n lags = list(case_rate = c(0, 1, 2, 3, 7, 14), death_rate = c(0, 7, 14)),\n ahead = 14\n )\n)\n```\n:::\n\n\nHere, we've used different lags on the `case_rate` and are now predicting 2 \nweeks ahead. 
Note that `lags` and `ahead` are in the same units as the \n`time_value` of the `epi_df` used for training (same as the `epi_slide()` \narguments discussed in @sec-sliding). This example also illustrates\na major difficulty with the \"iterative\" versions of AR models. This model \ndoesn't produce forecasts for `case_rate`, and so, would not have data to \n\"plug in\" for the necessary lags.[^2]\n\n[^2]: An obvious fix is to instead use a VAR and predict both, but this would \nlikely increase the variance of the model, and therefore, may lead to less \naccurate forecasts for the variable of interest.\n\n\nAnother property of the basic model is the predictive interval. We describe this in more detail in a coming chapter, but it is easy to request multiple quantiles.\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/differential-levels_d107da1eb83cee11a17c8258582355d3'}\n\n```{.r .cell-code}\nout_q <- arx_forecaster(jhu, \"death_rate\", c(\"case_rate\", \"death_rate\"),\n args_list = arx_args_list(\n quantile_levels = c(.01, .025, seq(.05, .95, by = .05), .975, .99)\n )\n)\n```\n:::\n\n\nThe column `.pred_distn` in the `predictions` object is actually a \"distribution\" here parameterized by its quantiles. For this default forecaster, these are created using the quantiles of the residuals of the predictive model (possibly symmetrized). 
Here, we used 23 quantiles, but one can grab a particular quantile,\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/q1_5df8261d92421ead6dd2d77e0a127517'}\n\n```{.r .cell-code}\nhead(quantile(out_q$predictions$.pred_distn, p = .4))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> 40% 40% 40% 40% 40% 40% \n#> 0.30277798 0.27213225 0.44345734 0.03120647 0.56121844 0.27492711\n```\n:::\n:::\n\n\nor extract the entire distribution into a \"long\" `epi_df` with `quantile_levels` being the probability and `values` being the value associated to that quantile.\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/q2_439cb3bc49eb03d8b4c34070ac5ba21d'}\n\n```{.r .cell-code}\nout_q$predictions %>%\n # first create a \"nested\" list-column\n mutate(.pred_distn = nested_quantiles(.pred_distn)) %>%\n unnest(.pred_distn) # then unnest it\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 1,288 × 6\n#> geo_value .pred values quantile_levels forecast_date target_date\n#> \n#> 1 ak 0.355 0 0.01 2021-12-31 2022-01-07 \n#> 2 ak 0.355 0 0.025 2021-12-31 2022-01-07 \n#> 3 ak 0.355 0.0371 0.05 2021-12-31 2022-01-07 \n#> 4 ak 0.355 0.123 0.1 2021-12-31 2022-01-07 \n#> 5 ak 0.355 0.174 0.15 2021-12-31 2022-01-07 \n#> 6 ak 0.355 0.211 0.2 2021-12-31 2022-01-07 \n#> # ℹ 1,282 more rows\n```\n:::\n:::\n\n\nAdditional simple adjustments to the basic forecaster can be made using the function:\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/unnamed-chunk-6_1cde7efd6c6aec4703b1bae39db842b3'}\n\n```{.r .cell-code}\narx_args_list(\n lags = c(0L, 7L, 14L), ahead = 7L, n_training = Inf,\n forecast_date = NULL, target_date = NULL, quantile_levels = c(0.05, 0.95),\n symmetrize = TRUE, nonneg = TRUE, quantile_by_key = \"geo_value\"\n)\n```\n:::\n\n\n## Changing the engine\n\nSo far, our forecasts have been produced using simple linear regression. 
But this is not the only way to estimate such a model.\nThe `trainer` argument determines the type of model we want. \nThis takes a [`{parsnip}`](https://parsnip.tidymodels.org) model. The default is linear regression, but we could instead use a random forest with the `{ranger}` package:\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/ranger_165b974f4c4580092d3398b1d2bee018'}\n\n```{.r .cell-code}\nout_rf <- arx_forecaster(\n jhu, \"death_rate\", c(\"case_rate\", \"death_rate\"),\n rand_forest(mode = \"regression\")\n)\n```\n:::\n\n\nOr boosted regression trees with `{xgboost}`:\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/xgboost_59219c946dd5689d23feb924a64c64be'}\n\n```{.r .cell-code}\nout_gb <- arx_forecaster(\n jhu, \"death_rate\", c(\"case_rate\", \"death_rate\"),\n boost_tree(mode = \"regression\", trees = 20)\n)\n```\n:::\n\n\nOr quantile regression, using our custom forecasting engine `quantile_reg()`:\n\n\n::: {.cell layout-align=\"center\" hash='epipredict_cache/html/quantreg_4e2d216ee037905d2417be2d184f5664'}\n\n```{.r .cell-code}\nout_gb <- arx_forecaster(\n jhu, \"death_rate\", c(\"case_rate\", \"death_rate\"),\n quantile_reg()\n)\n```\n:::\n\n\nFWIW, this last case (using quantile regression), is not far from what the Delphi production forecast team used for its Covid forecasts over the past few years.\n\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/flatline-forecaster/execute-results/html.json b/_freeze/flatline-forecaster/execute-results/html.json index b3c1406..84a7ed3 100644 --- a/_freeze/flatline-forecaster/execute-results/html.json +++ b/_freeze/flatline-forecaster/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "eb1205f9e5fddf491d80688b96439477", + "hash": "da955731d2bd1d45127a8b350aed8bce", "result": { - "markdown": "# Introducing the flatline forecaster\n\nThe flatline forecaster is a very simple forecasting model intended for `epi_df` data, where the most 
recent observation is used as the forecast for any future date. In other words, the last observation is propagated forward. Hence, a flat line phenomenon is observed for the point predictions. The predictive intervals are produced from the quantiles of the residuals of such a forecast over all of the training data. By default, these intervals will be obtained separately for each combination of keys (`geo_value` and any additional keys) in the `epi_df`. Thus, the output is a data frame of point (and optionally interval) forecasts at a single unique horizon (`ahead`) for each unique combination of key variables. This forecaster is comparable to the baseline used by the [COVID Forecast Hub](https://covid19forecasthub.org).\n\n## Example of using the flatline forecaster\n\n\n::: {.cell hash='flatline-forecaster_cache/html/unnamed-chunk-1_7ecbc96792f1278d5e72283951cc2098'}\n\n:::\n\n\n\nWe will continue to use the `case_death_rate_subset` dataset that comes with the\n`epipredict` package. In brief, this is a subset of the JHU daily COVID-19 cases\nand deaths by state. While this dataset ranges from Dec 31, 2020 to Dec 31, \n2021, we will only consider a small subset at the end of that range to keep our\nexample relatively simple.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-2_bdb1903df82234e00ed17b2089a9dcf7'}\n\n```{.r .cell-code}\njhu <- case_death_rate_subset %>%\n dplyr::filter(time_value >= as.Date(\"2021-09-01\"))\n\njhu\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 6,832 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-31 12:08:25.791826\n#> \n#> # A tibble: 6,832 × 4\n#> geo_value time_value case_rate death_rate\n#> * \n#> 1 ak 2021-09-01 75.3 0.198\n#> 2 al 2021-09-01 113. 
0.845\n#> 3 ar 2021-09-01 68.5 0.919\n#> 4 as 2021-09-01 0 0 \n#> 5 az 2021-09-01 48.8 0.414\n#> 6 ca 2021-09-01 38.4 0.246\n#> # ℹ 6,826 more rows\n```\n:::\n:::\n\n\n### The basic mechanics of the flatline forecaster\n\nThe simplest way to create and train a flatline forecaster to predict the d\neath rate one week into the future, is to input the `epi_df` and the name of \nthe column from it that we want to predict in the `flatline_forecaster` function.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-3_d9bff13811c7ad711033cc306bce0068'}\n\n```{.r .cell-code}\none_week_ahead <- flatline_forecaster(jhu, outcome = \"death_rate\")\none_week_ahead\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> ══ A basic forecaster of type flatline ══════════════════════════════════════\n#> \n#> This forecaster was fit on 2023-06-19 20:31:13\n#> \n#> Training data was an `epi_df` with\n#> • Geography: state,\n#> • Time type: day,\n#> • Using data up-to-date as of: 2022-05-31 12:08:25.\n#> \n#> ── Predictions ──────────────────────────────────────────────────────────────\n#> \n#> A total of 56 predictions are available for\n#> • 56 unique geographic regions,\n#> • At forecast dates: 2021-12-31,\n#> • For target dates: 2022-01-07.\n```\n:::\n:::\n\n\nThe result is both a fitted model object which could be used any time in the \nfuture to create different forecasts, as well as a set of predicted values and\nprediction intervals for each location 7 days after the last available time\nvalue in the data, which is Dec 31, 2021. Note that 7 days is the default\nnumber of time steps ahead of the forecast date in which forecasts should be\nproduced. To change this, you must change the value of the `ahead` parameter\nin the list of additional arguments `flatline_args_list()`. 
Let's change this\nto 5 days to get some practice.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-4_013bf1041937f63c3b495c7ed5b211b9'}\n\n```{.r .cell-code}\nfive_days_ahead <- flatline_forecaster(\n jhu,\n outcome = \"death_rate\",\n flatline_args_list(ahead = 5L)\n)\n\nfive_days_ahead\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> ══ A basic forecaster of type flatline ══════════════════════════════════════\n#> \n#> This forecaster was fit on 2023-06-19 20:31:14\n#> \n#> Training data was an `epi_df` with\n#> • Geography: state,\n#> • Time type: day,\n#> • Using data up-to-date as of: 2022-05-31 12:08:25.\n#> \n#> ── Predictions ──────────────────────────────────────────────────────────────\n#> \n#> A total of 56 predictions are available for\n#> • 56 unique geographic regions,\n#> • At forecast dates: 2021-12-31,\n#> • For target dates: 2022-01-05.\n```\n:::\n:::\n\n\nWe could also specify that we want a 80% predictive interval by changing the \nlevels. 
The default 0.05 and 0.95 levels/quantiles give us 90% predictive \ninterval.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-5_04fc200f272f42d862a3c5693cc610b6'}\n\n```{.r .cell-code}\nfive_days_ahead <- flatline_forecaster(\n jhu,\n outcome = \"death_rate\",\n flatline_args_list(ahead = 5L, levels = c(0.1, 0.9))\n)\n\nfive_days_ahead\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> ══ A basic forecaster of type flatline ══════════════════════════════════════\n#> \n#> This forecaster was fit on 2023-06-19 20:31:14\n#> \n#> Training data was an `epi_df` with\n#> • Geography: state,\n#> • Time type: day,\n#> • Using data up-to-date as of: 2022-05-31 12:08:25.\n#> \n#> ── Predictions ──────────────────────────────────────────────────────────────\n#> \n#> A total of 56 predictions are available for\n#> • 56 unique geographic regions,\n#> • At forecast dates: 2021-12-31,\n#> • For target dates: 2022-01-05.\n```\n:::\n:::\n\n\nTo see the other arguments that you may modify, please see `?flatline_args_list()`. 
For now, we will move on to looking at the workflow.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-6_23d12864d38c6b2fb9458907db32530f'}\n\n```{.r .cell-code}\nfive_days_ahead$epi_workflow\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Epi Workflow [trained] ═══════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> Postprocessor: Frosting\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 2 Recipe Steps\n#> \n#> • step_epi_ahead()\n#> • step_training_window()\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> Flatline forecaster\n#> \n#> Predictions produced by geo_value resulting in 56 total forecasts.\n#> A total of 7112 residuals are available from the training set.\n#> \n#> ── Postprocessor ────────────────────────────────────────────────────────────\n#> 5 Frosting Layers\n#> \n#> • layer_predict()\n#> • layer_residual_quantiles()\n#> • layer_add_forecast_date()\n#> • layer_add_target_date()\n#> • layer_threshold()\n```\n:::\n:::\n\n\nThe fitted model here was based on minimal pre-processing of the data, \nestimating a flatline model, and then post-processing the results to be \nmeaningful for epidemiological tasks. To look deeper into the pre-processing, \nmodel and processing parts individually, you may use the `$` operator after `epi_workflow`. 
For example, let's examine the pre-processing part in more detail.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-7_7ebad54744c08badf9d3dac19334894f'}\n\n```{.r .cell-code}\nlibrary(workflows)\nextract_preprocessor(five_days_ahead$epi_workflow)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-8_3b5f80e746b0846bc6204aa82a69e29e'}\n\n```\n#> \n#> ── Recipe ───────────────────────────────────────────────────────────────────\n#> \n#> ── Inputs\n#> Number of variables by role\n#> predictor: 3\n#> geo_value: 1\n#> raw: 1\n#> time_value: 1\n#> \n#> ── Operations\n#> • Leading: death_rate by 5\n#> • # of recent observations per key limited to:: Inf\n```\n:::\n\n\n\nUnder Operations, we can see that the pre-processing operations were to lead the\ndeath rate by 5 days (`step_epi_ahead()`) and that the \\# of recent observations\nused in the training window were not limited (in `step_training_window()` as\n`n_training = Inf` in `flatline_args_list()`). 
You should also see the\nmolded/pre-processed training data.\n\nFor symmetry, let's have a look at the post-processing.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-9_0808824fe21d98fc697b5f19bc27352c'}\n\n```{.r .cell-code}\nextract_frosting(five_days_ahead$epi_workflow)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-10_72d5eb3c2909ae3ae55f4bacd445ad68'}\n\n```\n#> \n#> ── Frosting ─────────────────────────────────────────────────────────────────\n#> \n#> ── Layers\n#> • Creating predictions: \"\"\n#> • Resampling residuals for predictive quantiles: \"\" levels 0.1,\n#> 0.9\n#> • Adding forecast date: \"2021-12-31\"\n#> • Adding target date: \"2022-01-05\"\n#> • Thresholding predictions: dplyr::starts_with(\".pred\") to ]0, Inf)\n```\n:::\n\n\n\nThe post-processing operations in the order the that were performed were to create the predictions and the predictive intervals, add the forecast and target dates and bound the predictions at zero.\n\nWe can also easily examine the predictions themselves.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-11_926c2f170768c33a5e9312e52477a59e'}\n\n```{.r .cell-code}\nfive_days_ahead$predictions\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 56 × 5\n#> geo_value .pred .pred_distn forecast_date target_date\n#> \n#> 1 ak 0.0395 [0.1, 0.9] 2021-12-31 2022-01-05 \n#> 2 al 0.107 [0.1, 0.9] 2021-12-31 2022-01-05 \n#> 3 ar 0.490 [0.1, 0.9] 2021-12-31 2022-01-05 \n#> 4 as 0 [0.1, 0.9] 2021-12-31 2022-01-05 \n#> 5 az 0.608 [0.1, 0.9] 2021-12-31 2022-01-05 \n#> 6 ca 0.142 [0.1, 0.9] 2021-12-31 2022-01-05 \n#> # ℹ 50 more rows\n```\n:::\n:::\n\n\nThe results above show a distributional forecast produced using data through the end of 2021 for the January 5, 2022. 
A prediction for the death rate per 100K inhabitants along with a 95% predictive interval is available for every state (`geo_value`).\n\nThe figure below displays the prediction and prediction interval for three sample states: Arizona, New York, and Florida.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-12_338012827629211189d9c8b338a8fa27'}\n\n```{.r .cell-code code-fold=\"true\"}\nsamp_geos <- c(\"az\", \"ny\", \"fl\")\n\nhist <- jhu %>%\n filter(geo_value %in% samp_geos)\n\npreds <- five_days_ahead$predictions %>%\n filter(geo_value %in% samp_geos) %>%\n mutate(q = nested_quantiles(.pred_distn)) %>%\n unnest(q) %>%\n pivot_wider(names_from = tau, values_from = q)\n\nggplot(hist, aes(color = geo_value)) +\n geom_line(aes(time_value, death_rate)) +\n theme_bw() +\n geom_errorbar(data = preds, aes(x = target_date, ymin = `0.1`, ymax = `0.9`)) +\n geom_point(data = preds, aes(target_date, .pred)) +\n geom_vline(data = preds, aes(xintercept = forecast_date)) +\n scale_colour_viridis_d(name = \"\") +\n scale_x_date(date_labels = \"%b %Y\", date_breaks = \"1 month\") +\n facet_grid(geo_value ~ ., scales = \"free_y\") +\n theme(legend.position = \"none\") +\n labs(x = \"\", y = \"Incident deaths per 100K\\n inhabitants\")\n```\n\n::: {.cell-output-display}\n![](flatline-forecaster_files/figure-html/unnamed-chunk-12-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nThe vertical black line is the forecast date. Here the forecast seems pretty reasonable based on the past observations shown. In cases where the recent past is highly predictive of the near future, a simple flatline forecast may be respectable, but in more complex situations where there is more uncertainty of what's to come, the flatline forecaster may be best relegated to being a baseline model and nothing more.\n\nTake for example what happens when we consider a wider range of target dates. 
That is, we will now predict for several different horizons or `ahead` values - in our case, 5 to 25 days ahead, inclusive. Since the flatline forecaster function forecasts at a single unique `ahead` value, we can use the `map()` function from `purrr` to apply the forecaster to each ahead value we want to use. Then, we row bind the list of results.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-13_7cca912cb9c507cdcc24b5736638e0bd'}\n\n```{.r .cell-code}\nout_df <- map(1:28, ~ flatline_forecaster(\n epi_data = jhu,\n outcome = \"death_rate\",\n args_list = flatline_args_list(ahead = .x)\n)$predictions) %>%\n list_rbind()\n```\n:::\n\n\nThen, we proceed as we did before. The only difference from before is that we're using `out_df` where we had `five_days_ahead$predictions`.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-14_1548e56fb2990349e1650fd2101d3bf7'}\n\n```{.r .cell-code code-fold=\"true\"}\npreds <- out_df %>%\n filter(geo_value %in% samp_geos) %>%\n mutate(q = nested_quantiles(.pred_distn)) %>%\n unnest(q) %>%\n pivot_wider(names_from = tau, values_from = q)\n\nggplot(hist) +\n geom_line(aes(time_value, death_rate)) +\n geom_ribbon(\n data = preds,\n aes(x = target_date, ymin = `0.05`, ymax = `0.95`, fill = geo_value)\n ) +\n geom_point(data = preds, aes(target_date, .pred, colour = geo_value)) +\n geom_vline(data = preds, aes(xintercept = forecast_date)) +\n scale_colour_viridis_d() +\n scale_fill_viridis_d(alpha = .4) +\n scale_x_date(date_labels = \"%b %Y\", date_breaks = \"1 month\") +\n scale_y_continuous(expand = expansion(c(0, .05))) +\n facet_grid(geo_value ~ ., scales = \"free_y\") +\n labs(x = \"\", y = \"Incident deaths per 100K\\n inhabitants\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](flatline-forecaster_files/figure-html/unnamed-chunk-14-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nNow, you can really see the flat 
line trend in the predictions. And you may also observe that as we get further away from the forecast date, the more unnerving using a flatline prediction becomes. It feels increasingly unnatural.\n\nSo naturally the choice of forecaster relates to the time frame being considered. In general, using a flatline forecaster makes more sense for short-term forecasts than for long-term forecasts and for periods of great stability than in less stable times. Realistically, periods of great stability are rare. Moreover, in our model of choice we want to take into account more information about the past than just what happened at the most recent time point. So simple forecasters like the flatline forecaster don't cut it as actual contenders in many real-life situations. However, they are not useless, just used for a different purpose. A simple model is often used to compare a more complex model to, which is why you may have seen such a model used as a baseline in the [COVID Forecast Hub](https://covid19forecasthub.org). The following [blog post](https://delphi.cmu.edu/blog/2021/09/30/on-the-predictability-of-covid-19/#ensemble-forecast-performance) from Delphi explores the Hub's ensemble accuracy relative to such a baseline model.\n\n## What we've learned in a nutshell\n\nThough the flatline forecaster is a very basic model with limited customization, it is about as steady and predictable as a model can get. So it provides a good reference or baseline to compare more complicated models to.\n", + "markdown": "# Introducing the flatline forecaster\n\nThe flatline forecaster is a very simple forecasting model intended for `epi_df` data, where the most recent observation is used as the forecast for any future date. In other words, the last observation is propagated forward. Hence, a flat line phenomenon is observed for the point predictions. The predictive intervals are produced from the quantiles of the residuals of such a forecast over all of the training data. 
By default, these intervals will be obtained separately for each combination of keys (`geo_value` and any additional keys) in the `epi_df`. Thus, the output is a data frame of point (and optionally interval) forecasts at a single unique horizon (`ahead`) for each unique combination of key variables. This forecaster is comparable to the baseline used by the [COVID Forecast Hub](https://covid19forecasthub.org).\n\n## Example of using the flatline forecaster\n\n\n::: {.cell}\n\n:::\n\n\n\nWe will continue to use the `case_death_rate_subset` dataset that comes with the\n`epipredict` package. In brief, this is a subset of the JHU daily COVID-19 cases\nand deaths by state. While this dataset ranges from Dec 31, 2020 to Dec 31, \n2021, we will only consider a small subset at the end of that range to keep our\nexample relatively simple.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-2_bdb1903df82234e00ed17b2089a9dcf7'}\n\n```{.r .cell-code}\njhu <- case_death_rate_subset %>%\n dplyr::filter(time_value >= as.Date(\"2021-09-01\"))\n\njhu\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 6,832 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-31 12:08:25\n#> \n#> # A tibble: 6,832 × 4\n#> geo_value time_value case_rate death_rate\n#> * \n#> 1 ak 2021-09-01 75.3 0.198\n#> 2 al 2021-09-01 113. 
0.845\n#> 3 ar 2021-09-01 68.5 0.919\n#> 4 as 2021-09-01 0 0 \n#> 5 az 2021-09-01 48.8 0.414\n#> 6 ca 2021-09-01 38.4 0.246\n#> # ℹ 6,826 more rows\n```\n:::\n:::\n\n\n### The basic mechanics of the flatline forecaster\n\nThe simplest way to create and train a flatline forecaster to predict the\ndeath rate one week into the future is to input the `epi_df` and the name of \nthe column from it that we want to predict in the `flatline_forecaster` function.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-3_d9bff13811c7ad711033cc306bce0068'}\n\n```{.r .cell-code}\none_week_ahead <- flatline_forecaster(jhu, outcome = \"death_rate\")\none_week_ahead\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> ══ A basic forecaster of type flatline ══════════════════════════════════════\n#> \n#> This forecaster was fit on 2023-12-15 04:53:28\n#> \n#> Training data was an `epi_df` with\n#> • Geography: state,\n#> • Time type: day,\n#> • Using data up-to-date as of: 2022-05-31 12:08:25.\n#> \n#> ── Predictions ──────────────────────────────────────────────────────────────\n#> \n#> A total of 56 predictions are available for\n#> • 56 unique geographic regions,\n#> • At forecast dates: 2021-12-31,\n#> • For target dates: 2022-01-07.\n```\n:::\n:::\n\n\nThe result is both a fitted model object which could be used any time in the \nfuture to create different forecasts, as well as a set of predicted values and\nprediction intervals for each location 7 days after the last available time\nvalue in the data, which is Dec 31, 2021. Note that 7 days is the default\nnumber of time steps ahead of the forecast date in which forecasts should be\nproduced. To change this, you must change the value of the `ahead` parameter\nin the list of additional arguments `flatline_args_list()`. 
Let's change this\nto 5 days to get some practice.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-4_013bf1041937f63c3b495c7ed5b211b9'}\n\n```{.r .cell-code}\nfive_days_ahead <- flatline_forecaster(\n jhu,\n outcome = \"death_rate\",\n flatline_args_list(ahead = 5L)\n)\n\nfive_days_ahead\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> ══ A basic forecaster of type flatline ══════════════════════════════════════\n#> \n#> This forecaster was fit on 2023-12-15 04:53:28\n#> \n#> Training data was an `epi_df` with\n#> • Geography: state,\n#> • Time type: day,\n#> • Using data up-to-date as of: 2022-05-31 12:08:25.\n#> \n#> ── Predictions ──────────────────────────────────────────────────────────────\n#> \n#> A total of 56 predictions are available for\n#> • 56 unique geographic regions,\n#> • At forecast dates: 2021-12-31,\n#> • For target dates: 2022-01-05.\n```\n:::\n:::\n\n\nWe could also specify that we want an 80% predictive interval by changing the\nquantile levels. 
The default 0.05 and 0.95 levels/quantiles give us 90%\npredictive intervals.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-5_d198fa2e7b1019a625462597c0bb5e44'}\n\n```{.r .cell-code}\nfive_days_ahead <- flatline_forecaster(\n jhu,\n outcome = \"death_rate\",\n flatline_args_list(ahead = 5L, quantile_levels = c(0.1, 0.9))\n)\n\nfive_days_ahead\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> ══ A basic forecaster of type flatline ══════════════════════════════════════\n#> \n#> This forecaster was fit on 2023-12-15 04:53:29\n#> \n#> Training data was an `epi_df` with\n#> • Geography: state,\n#> • Time type: day,\n#> • Using data up-to-date as of: 2022-05-31 12:08:25.\n#> \n#> ── Predictions ──────────────────────────────────────────────────────────────\n#> \n#> A total of 56 predictions are available for\n#> • 56 unique geographic regions,\n#> • At forecast dates: 2021-12-31,\n#> • For target dates: 2022-01-05.\n```\n:::\n:::\n\n\nTo see the other arguments that you may modify, please see `?flatline_args_list()`. For now, we will move on to looking at the workflow.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-6_f92b9552034cb6e9cb468a203402d459'}\n\n```{.r .cell-code}\nfive_days_ahead$epi_workflow\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-7_7fcfe8b9cf9e732fc9711a48dd0a6447'}\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Epi Workflow [trained] ═══════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> Postprocessor: Frosting\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 2 Recipe Steps\n#> \n#> 1. step_epi_ahead()\n#> 2. 
step_training_window()\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> Flatline forecaster\n#> \n#> Predictions produced by geo_value resulting in 56 total forecasts.\n#> A total of 7112 residuals are available from the training set.\n#> \n#> ── Postprocessor ────────────────────────────────────────────────────────────\n#> 5 Frosting Layers\n#> \n#> 1. layer_predict()\n#> 2. layer_residual_quantiles()\n#> 3. layer_add_forecast_date()\n#> 4. layer_add_target_date()\n#> 5. layer_threshold()\n```\n:::\n:::\n\n\nThe fitted model here was based on minimal pre-processing of the data, \nestimating a flatline model, and then post-processing the results to be \nmeaningful for epidemiological tasks. To look deeper into the pre-processing, \nmodel, and post-processing parts individually, you may use the `$` operator after `epi_workflow`. For example, let's examine the pre-processing part in more detail.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-8_fbd01388131646dfe7fbad6fff23bad6'}\n\n```{.r .cell-code}\nlibrary(workflows)\nextract_preprocessor(five_days_ahead$epi_workflow)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-9_538e8da583cba9d8903f91cbbc6e57a4'}\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> ── Epi Recipe ───────────────────────────────────────────────────────────────\n#> \n#> ── Inputs \n#> Number of variables by role\n#> predictor: 3\n#> geo_value: 1\n#> raw: 1\n#> time_value: 1\n#> \n#> ── Operations \n#> 1. Leading: death_rate by 5\n#> 2. • # of recent observations per key limited to:: Inf\n```\n:::\n:::\n\n\n\nUnder Operations, we can see that the pre-processing operations were to lead the\ndeath rate by 5 days (`step_epi_ahead()`) and that the \\# of recent observations\nused in the training window were not limited (in `step_training_window()` as\n`n_training = Inf` in `flatline_args_list()`). 
You should also see the\nmolded/pre-processed training data.\n\nFor symmetry, let's have a look at the post-processing.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-10_1ffcc771d298d8777adb14da4c33af11'}\n\n```{.r .cell-code}\nextract_frosting(five_days_ahead$epi_workflow)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-11_5f0f138a1844d85ed0f9ee1615974a55'}\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> ── Frosting ─────────────────────────────────────────────────────────────────\n#> \n#> ── Layers \n#> 1. Creating predictions: \"\"\n#> 2. Resampling residuals for predictive quantiles: \"\"\n#> quantile_levels 0.1, 0.9\n#> 3. Adding forecast date: \"2021-12-31\"\n#> 4. Adding target date: \"2022-01-05\"\n#> 5. Thresholding predictions: dplyr::starts_with(\".pred\") to [0, Inf)\n```\n:::\n:::\n\n\n\nThe post-processing operations, in the order that they were performed, were to create the predictions and the predictive intervals, add the forecast and target dates, and bound the predictions at zero.\n\nWe can also easily examine the predictions themselves.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-12_7b35816dbfa62bfcb0cd758284b972ae'}\n\n```{.r .cell-code}\nfive_days_ahead$predictions\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 56 × 5\n#> geo_value .pred .pred_distn forecast_date target_date\n#> \n#> 1 ak 0.0395 quantiles(0.11)[2] 2021-12-31 2022-01-05 \n#> 2 al 0.107 quantiles(0.14)[2] 2021-12-31 2022-01-05 \n#> 3 ar 0.490 quantiles(0.49)[2] 2021-12-31 2022-01-05 \n#> 4 as 0 quantiles(0.09)[2] 2021-12-31 2022-01-05 \n#> 5 az 0.608 quantiles(0.61)[2] 2021-12-31 2022-01-05 \n#> 6 ca 0.142 quantiles(0.16)[2] 2021-12-31 2022-01-05 \n#> # ℹ 50 more rows\n```\n:::\n:::\n\n\nThe results above show a distributional forecast produced using data through the end of 2021 for the target date of January 5, 2022. 
A prediction for the death rate per 100K inhabitants along with an 80% predictive interval is available for every state (`geo_value`).\n\nThe figure below displays the prediction and prediction interval for three sample states: Arizona, New York, and Florida.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-13_6e5e32ad4dda558766ca9090bf4e927c'}\n\n```{.r .cell-code code-fold=\"true\"}\nsamp_geos <- c(\"az\", \"ny\", \"fl\")\n\nhist <- jhu %>%\n filter(geo_value %in% samp_geos)\n\npreds <- five_days_ahead$predictions %>%\n filter(geo_value %in% samp_geos) %>%\n pivot_quantiles_wider(.pred_distn)\n\nggplot(hist, aes(color = geo_value)) +\n geom_line(aes(time_value, death_rate)) +\n theme_bw() +\n geom_errorbar(data = preds, aes(x = target_date, ymin = `0.1`, ymax = `0.9`)) +\n geom_point(data = preds, aes(target_date, .pred)) +\n geom_vline(data = preds, aes(xintercept = forecast_date)) +\n scale_colour_viridis_d(name = \"\") +\n scale_x_date(date_labels = \"%b %Y\", date_breaks = \"1 month\") +\n facet_grid(geo_value ~ ., scales = \"free_y\") +\n theme(legend.position = \"none\") +\n labs(x = \"\", y = \"Incident deaths per 100K\\n inhabitants\")\n```\n\n::: {.cell-output-display}\n![](flatline-forecaster_files/figure-html/unnamed-chunk-13-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nThe vertical black line is the forecast date. Here the forecast seems pretty reasonable based on the past observations shown. In cases where the recent past is highly predictive of the near future, a simple flatline forecast may be respectable, but in more complex situations where there is more uncertainty of what's to come, the flatline forecaster may be best relegated to being a baseline model and nothing more.\n\nTake for example what happens when we consider a wider range of target dates. That is, we will now predict for several different horizons or `ahead` values - in our case, 1 to 28 days ahead, inclusive. 
Since the flatline forecaster function forecasts at a single unique `ahead` value, we can use the `map()` function from `purrr` to apply the forecaster to each ahead value we want to use. Then, we row bind the list of results.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-14_fef7cecf6c6cefba67f0fca89b6f3083'}\n\n```{.r .cell-code}\nout_df <- map(1:28, ~ flatline_forecaster(\n epi_data = jhu,\n outcome = \"death_rate\",\n args_list = flatline_args_list(ahead = .x)\n)$predictions) %>%\n list_rbind()\n```\n:::\n\n\nThen, we proceed as we did before. The only difference from before is that we're using `out_df` where we had `five_days_ahead$predictions`.\n\n\n::: {.cell layout-align=\"center\" hash='flatline-forecaster_cache/html/unnamed-chunk-15_3eaf57af002d2a17112638dd3f149e0e'}\n\n```{.r .cell-code code-fold=\"true\"}\npreds <- out_df %>%\n filter(geo_value %in% samp_geos) %>%\n pivot_quantiles_wider(.pred_distn)\n\nggplot(hist) +\n geom_line(aes(time_value, death_rate)) +\n geom_ribbon(\n data = preds,\n aes(x = target_date, ymin = `0.05`, ymax = `0.95`, fill = geo_value)\n ) +\n geom_point(data = preds, aes(target_date, .pred, colour = geo_value)) +\n geom_vline(data = preds, aes(xintercept = forecast_date)) +\n scale_colour_viridis_d() +\n scale_fill_viridis_d(alpha = .4) +\n scale_x_date(date_labels = \"%b %Y\", date_breaks = \"1 month\") +\n scale_y_continuous(expand = expansion(c(0, .05))) +\n facet_grid(geo_value ~ ., scales = \"free_y\") +\n labs(x = \"\", y = \"Incident deaths per 100K\\n inhabitants\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](flatline-forecaster_files/figure-html/unnamed-chunk-15-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nNow, you can really see the flat line trend in the predictions. And you may also observe that as we get further away from the forecast date, the more unnerving using a flatline prediction becomes. 
It feels increasingly unnatural.\n\nSo naturally the choice of forecaster relates to the time frame being considered. In general, using a flatline forecaster makes more sense for short-term forecasts than for long-term forecasts and for periods of great stability than in less stable times. Realistically, periods of great stability are rare. Moreover, in our model of choice we want to take into account more information about the past than just what happened at the most recent time point. So simple forecasters like the flatline forecaster don't cut it as actual contenders in many real-life situations. However, they are not useless, just used for a different purpose. A simple model is often used to compare a more complex model to, which is why you may have seen such a model used as a baseline in the [COVID Forecast Hub](https://covid19forecasthub.org). The following [blog post](https://delphi.cmu.edu/blog/2021/09/30/on-the-predictability-of-covid-19/#ensemble-forecast-performance) from Delphi explores the Hub's ensemble accuracy relative to such a baseline model.\n\n## What we've learned in a nutshell\n\nThough the flatline forecaster is a very basic model with limited customization, it is about as steady and predictable as a model can get. 
So it provides a good reference or baseline to compare more complicated models to.\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/flatline-forecaster/figure-html/unnamed-chunk-15-1.svg b/_freeze/flatline-forecaster/figure-html/unnamed-chunk-15-1.svg new file mode 100644 index 0000000..9d59241 --- /dev/null +++ b/_freeze/flatline-forecaster/figure-html/unnamed-chunk-15-1.svg @@ -0,0 +1,879 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_freeze/forecast-framework/execute-results/html.json b/_freeze/forecast-framework/execute-results/html.json index 808585b..b2469df 100644 --- a/_freeze/forecast-framework/execute-results/html.json +++ b/_freeze/forecast-framework/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "6093bfe321dbf60920ca2db6eb89d5bd", + "hash": "67c8411020f78b45d3ee55625996d060", "result": { - "markdown": "# Inner workings of the framework\n\n\n\n\n\nUnderneath the hood, the `arx_forecaster()` (and all our canned\nforecasters) creates (and returns) an `epi_workflow`. \nEssentially, this is a big S3 object that wraps up the 4 modular steps \n(preprocessing - postprocessing) described in the last chapter.\n\n1. Preprocessor: make transformations to the data before model training\n2. Trainer: train a model on data, resulting in a fitted model object\n3. Predictor: make predictions, using a fitted model object and processed test data\n4. Postprocessor: manipulate or transform the predictions before returning\n\nLet's investigate how these interact with `{tidymodels}` and why it's important\nto think of forecasting this way. To have something to play with, we'll continue\nto examine the data and an estimated canned corecaster.\n\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/demo-workflow_38ac956904953873a24c7e4dd0648ab5'}\n\n```{.r .cell-code}\njhu <- case_death_rate_subset %>%\n filter(time_value >= max(time_value) - 30)\n\nout_gb <- arx_forecaster(\n jhu, \"death_rate\", c(\"case_rate\", \"death_rate\"),\n boost_tree(mode = \"regression\", trees = 20)\n)\n```\n:::\n\n\n## Preprocessing\n\nPreprocessing is accomplished through a `recipe` (imagine baking a cake) as \nprovided in the [`{recipes}`](https://recipes.tidymodels.org) package. 
\nWe've made a few modifications (to handle\npanel data) as well as added some additional options. The recipe gives a\nspecification of how to handle training data. Think of it like a fancified\n`formula` that you would pass to `lm()`: `y ~ x1 + log(x2)`. In general, \nthere are 2 extensions to the `formula` that `{recipes}` handles: \n\n 1. Doing transformations of both training and test data that can always be \n applied. These are things like taking the log of a variable, leading or \n lagging, filtering out rows, handling dummy variables, etc.\n 2. Using statistics from the training data to eventually process test data. \n This is a major benefit of `{recipes}`. It prevents what the tidy team calls\n \"data leakage\". A simple example is centering a predictor by its mean. We\n need to store the mean of the predictor from the training data and use that\n value on the test data rather than accidentally calculating the mean of\n the test predictor for centering.\n \nA recipe is processed in 2 steps, first it is \"prepped\". This calculates and\nstores any intermediate statistics necessary for use on the test data. \nThen it is \"baked\"\nresulting in training data ready for passing into a statistical model (like `lm`).\n\nWe have introduced an `epi_recipe`. 
It's just a `recipe` that knows how to handle\nthe `time_value`, `geo_value`, and any additional keys so that these are available\nwhen necessary.\n\nThe `epi_recipe` from `out_gb` can be extracted from the result:\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-2_701136b2c4657141ea38354f5aad0130'}\n\n```{.r .cell-code}\nlibrary(workflows)\nlibrary(recipes)\nextract_recipe(out_gb$epi_workflow)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-3_003b8015fdb7de99adc7e1683848927f'}\n\n```\n#> \n#> ── Recipe ─────────────────────────────────────────────────────────\n#> \n#> ── Inputs\n#> Number of variables by role\n#> raw: 2\n#> geo_value: 1\n#> time_value: 1\n#> \n#> ── Training information\n#> Training data contained 1736 data points and no incomplete rows.\n#> \n#> ── Operations\n#> • Lagging: case_rate by 0, 7, 14 | Trained\n#> • Lagging: death_rate by 0, 7, 14 | Trained\n#> • Leading: death_rate by 7 | Trained\n#> • Removing rows with NA values in: lag_0_case_rate, ... | Trained\n#> • Removing rows with NA values in: ahead_7_death_rate | Trained\n#> • # of recent observations per key limited to:: Inf | Trained\n```\n:::\n\n\n\nThe \"Inputs\" are the original `epi_df` and the \"roles\" that these are assigned.\nNone of these are predictors or outcomes. Those will be created \nby the recipe when it is prepped. The \"Operations\" are the sequence of \ninstructions to create the cake (baked training data).\nHere we create lagged predictors, lead the outcome, and then remove `NA`s.\nSome models like `lm` internally handle `NA`s, but not everything does, so we\ndeal with them explicitly. 
The code to do this (inside the forecaster) is\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-4_7b1a5f279cb0216eb98e81324ade71aa'}\n\n```{.r .cell-code}\ner <- epi_recipe(jhu) %>%\n step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>%\n step_epi_ahead(death_rate, ahead = 7) %>%\n step_epi_naomit()\n```\n:::\n\n\nWhile `{recipes}` provides a function `step_lag()`, it assumes that the data\nhave no breaks in the sequence of `time_values`. This is a bit dangerous, so\nwe avoid that behaviour. Our `lag/ahead` functions also appropriately adjust the\namount of data to avoid accidentally dropping recent predictors from the test\ndata.\n\n## The model specification\n\nUsers familiar with the `{parsnip}` package will have no trouble here.\nBasically, `{parsnip}` unifies the function signature across statistical models.\nFor example, `lm()` \"likes\" to work with formulas, but `glmnet::glmnet()` uses\n`x` and `y` for predictors and response. `{parsnip}` is agnostic. Both of these\ndo \"linear regression\". 
Above we switched from `lm()` to `xgboost()` without \nany issue despite the fact that these functions couldn't be more different.\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-5_904b72e53b7ebec817c2ff42657fface'}\n\n```{.r .cell-code}\nlm(\n formula, data, subset, weights, na.action,\n method = \"qr\",\n model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,\n contrasts = NULL, offset, ...\n)\n\nxgboost(\n data = NULL, label = NULL, missing = NA, weight = NULL,\n params = list(), nrounds, verbose = 1, print_every_n = 1L,\n early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,\n save_name = \"xgboost.model\", xgb_model = NULL, callbacks = list(),\n ...\n)\n```\n:::\n\n\n`{epipredict}` provides a few engines/modules like `flatline()` and \n`quantile_reg()` to power the `flatline_forecaster()` and provide quantile \nregression, but you should be able to use almost any available models\nlisted [here](https://www.tidymodels.org/find/parsnip/).\n\n\nTo estimate (fit) a preprocessed model, one calls `fit()` on the `epi_workflow`.\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-6_aa2a47d336b8de9c69394816f02e101e'}\n\n```{.r .cell-code}\newf <- epi_workflow(er, linear_reg()) %>% fit(jhu)\n```\n:::\n\n\n## Predicting and Postprocessing (bound together)\n\nTo stretch the metaphor of preparing a cake to its natural limits, we have\ncreated postprocessing functionality called \"frosting\". Much like the recipe,\neach postprocessing operation is a \"layer\" and we \"slather\" these onto our \nbaked cake. 
To fix ideas, below is the postprocessing `frosting` for \n`arx_forecaster()`\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-7_de6af11afc0b108fae8b915da6125069'}\n\n```{.r .cell-code}\nextract_frosting(out_gb$epi_workflow)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-8_9b4f4402e9fc39b22a2aca0aeca7a049'}\n\n```\n#> \n#> ── Frosting ─────────────────────────────────────────────────────────────────\n#> \n#> ── Layers\n#> • Creating predictions: \"\"\n#> • Resampling residuals for predictive quantiles: \"\" levels 0.05,\n#> 0.95\n#> • Adding forecast date: \"2021-12-31\"\n#> • Adding target date: \"2022-01-07\"\n#> • Thresholding predictions: dplyr::starts_with(\".pred\") to ]0, Inf)\n```\n:::\n\n\n\nHere we have 5 layers of frosting. The first generates the forecasts from the test data.\nThe second uses quantiles of the residuals to create distributional\nforecasts. The next two add columns for the date the forecast was made and the\ndate for which it is intended to occur. Because we are predicting rates, they \nshould be non-negative, so the last layer thresholds both predicted values and\nintervals at 0. 
The code to do this (inside the forecaster) is\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-9_9d652e9268c146341e4fb33091fd643d'}\n\n```{.r .cell-code}\nf <- frosting() %>%\n layer_predict() %>%\n layer_residual_quantiles(\n probs = c(.01, .025, seq(.05, .95, by = .05), .975, .99),\n symmetrize = TRUE\n ) %>%\n layer_add_forecast_date() %>%\n layer_add_target_date() %>%\n layer_threshold(starts_with(\".pred\"))\n```\n:::\n\n\nAt predict time, we add this object onto the `epi_workflow` and call `predict()`\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-10_1af407921661ddcfded119d9726e1a59'}\n\n```{.r .cell-code}\ntest_data <- get_test_data(er, jhu)\newf %>%\n add_frosting(f) %>%\n predict(test_data)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 56 x 6 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-31 12:08:25.791826\n#> \n#> # A tibble: 56 × 6\n#> geo_value time_value .pred .pred_distn forecast_date target_date\n#> * \n#> 1 ak 2021-12-31 0.355 [0.01, 0.99] 2021-12-31 2022-01-07 \n#> 2 al 2021-12-31 0.325 [0.01, 0.99] 2021-12-31 2022-01-07 \n#> 3 ar 2021-12-31 0.496 [0.01, 0.99] 2021-12-31 2022-01-07 \n#> 4 as 2021-12-31 0.0836 [0.01, 0.99] 2021-12-31 2022-01-07 \n#> 5 az 2021-12-31 0.614 [0.01, 0.99] 2021-12-31 2022-01-07 \n#> 6 ca 2021-12-31 0.327 [0.01, 0.99] 2021-12-31 2022-01-07 \n#> # ℹ 50 more rows\n```\n:::\n:::\n\n\nThe above `get_test_data()` function examines the recipe and ensures that enough\ntest data is available to create the necessary lags and produce a prediction\nfor the desired future time point (after the end of the training data). This mimics\nwhat would happen if `jhu` contained the most recent available historical data and\nwe wanted to actually predict the future. 
We could have instead used any test data\nthat contained the necessary predictors.\n\n:::{.callout-note}\nIn the predictions above, you'll see a `time_value` column. That's because we \ncould use **any training data**. We happened to use training data corresponding\nto the most recent available, and it's lags. But we could have instead used\nlast week's or we could use the data that arrives next year, or we could use multiple\n`time_values` for multiple locations. This is completely allowed, though not\nnecessarily what you expect.\n\nIn production forecasting, you'd probably reestimate the model and produce new\npredictions whenever new data arrives. This is exactly what all the canned \nforecasters we provide do. So those strip out the `time_value` column.\n\nBut the next most likely procedure would be\nto feed your previously estimated model (without refitting) the new data.\nTo do this, you'd just call `get_test_data()` on that new data. And the \n`time_value` would still be the same as your `forecast_date`.\n\nGetting many forecasts (multiple `time_values`) for each location, is not\nexactly a typical desire in this context. But it's also not unheard of, so\nit is possible (and analogous to standard, non-time series forecasting). \n:::\n\n\n## Conclusion\n\nInternally, we provide some canned forecaster functions to create reasonable forecasts. \nBut ideally, a user could create their own forecasters by building up the \ncomponents we provide. In other chapters, we try to walk through some of these\ncustomizations. 
\n\nTo illustrate everything above, here is (roughly) the code for the \n`arx_forecaster()` to predict the death rate, 1 week ahead:\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-11_d45e3276831a1b40877b2251297fbb9d'}\n\n```{.r .cell-code}\nr <- epi_recipe(jhu) %>%\n step_epi_ahead(death_rate, ahead = 7) %>%\n step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>%\n step_epi_naomit()\n\nlatest <- get_test_data(r, jhu)\n\nf <- frosting() %>%\n layer_predict() %>%\n layer_residual_quantiles() %>%\n layer_add_forecast_date() %>%\n layer_add_target_date() %>%\n layer_threshold(starts_with(\".pred\"))\n\neng <- linear_reg()\nwf <- epi_workflow(r, eng, f) %>% fit(jhu)\npreds <- predict(wf, latest)\n```\n:::\n\nThe code for `arx_forecaster()` simply generalizes this, passing along arguments as needed.\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-12_b7f75d610d9c4f0ced30040e9aa3a481'}\n\n```{.r .cell-code}\npreds\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 56 x 6 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-31 12:08:25.791826\n#> \n#> # A tibble: 56 × 6\n#> geo_value time_value .pred .pred_distn forecast_date target_date\n#> * \n#> 1 ak 2021-12-31 36.4 [0.05, 0.95] 2021-12-31 2022-01-07 \n#> 2 al 2021-12-31 89.9 [0.05, 0.95] 2021-12-31 2022-01-07 \n#> 3 ar 2021-12-31 82.6 [0.05, 0.95] 2021-12-31 2022-01-07 \n#> 4 as 2021-12-31 0 [0.05, 0.95] 2021-12-31 2022-01-07 \n#> 5 az 2021-12-31 58.3 [0.05, 0.95] 2021-12-31 2022-01-07 \n#> 6 ca 2021-12-31 84.4 [0.05, 0.95] 2021-12-31 2022-01-07 \n#> # ℹ 50 more rows\n```\n:::\n:::\n", + "markdown": "# Inner workings of the framework\n\n\n\n\n\nUnderneath the hood, the `arx_forecaster()` (and all our canned\nforecasters) creates (and returns) an `epi_workflow`. 
\nEssentially, this is a big S3 object that wraps up the 4 modular steps \n(preprocessing - postprocessing) described in the last chapter.\n\n1. Preprocessor: make transformations to the data before model training\n2. Trainer: train a model on data, resulting in a fitted model object\n3. Predictor: make predictions, using a fitted model object and processed test data\n4. Postprocessor: manipulate or transform the predictions before returning\n\nLet's investigate how these interact with `{tidymodels}` and why it's important\nto think of forecasting this way. To have something to play with, we'll continue\nto examine the data and an estimated canned forecaster.\n\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/demo-workflow_38ac956904953873a24c7e4dd0648ab5'}\n\n```{.r .cell-code}\njhu <- case_death_rate_subset %>%\n filter(time_value >= max(time_value) - 30)\n\nout_gb <- arx_forecaster(\n jhu, \"death_rate\", c(\"case_rate\", \"death_rate\"),\n boost_tree(mode = \"regression\", trees = 20)\n)\n```\n:::\n\n\n## Preprocessing\n\nPreprocessing is accomplished through a `recipe` (imagine baking a cake) as \nprovided in the [`{recipes}`](https://recipes.tidymodels.org) package. \nWe've made a few modifications (to handle\npanel data) as well as added some additional options. The recipe gives a\nspecification of how to handle training data. Think of it like a fancified\n`formula` that you would pass to `lm()`: `y ~ x1 + log(x2)`. In general, \nthere are 2 extensions to the `formula` that `{recipes}` handles: \n\n 1. Doing transformations of both training and test data that can always be \n applied. These are things like taking the log of a variable, leading or \n lagging, filtering out rows, handling dummy variables, etc.\n 2. Using statistics from the training data to eventually process test data. \n This is a major benefit of `{recipes}`. It prevents what the tidy team calls\n \"data leakage\". 
A simple example is centering a predictor by its mean. We\n need to store the mean of the predictor from the training data and use that\n value on the test data rather than accidentally calculating the mean of\n the test predictor for centering.\n \nA recipe is processed in 2 steps, first it is \"prepped\". This calculates and\nstores any intermediate statistics necessary for use on the test data. \nThen it is \"baked\"\nresulting in training data ready for passing into a statistical model (like `lm`).\n\nWe have introduced an `epi_recipe`. It's just a `recipe` that knows how to handle\nthe `time_value`, `geo_value`, and any additional keys so that these are available\nwhen necessary.\n\nThe `epi_recipe` from `out_gb` can be extracted from the result:\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-2_701136b2c4657141ea38354f5aad0130'}\n\n```{.r .cell-code}\nlibrary(workflows)\nlibrary(recipes)\nextract_recipe(out_gb$epi_workflow)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-3_7183beb656c33b42c6c5f5d765805857'}\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> ── Epi Recipe ───────────────────────────────────────────────────────────────\n#> \n#> ── Inputs \n#> Number of variables by role\n#> raw: 2\n#> geo_value: 1\n#> time_value: 1\n#> \n#> ── Training information \n#> Training data contained 1736 data points and no incomplete rows.\n#> \n#> ── Operations \n#> 1. Lagging: case_rate by 0, 7, 14 | Trained\n#> 2. Lagging: death_rate by 0, 7, 14 | Trained\n#> 3. Leading: death_rate by 7 | Trained\n#> 4. • Removing rows with NA values in: lag_0_case_rate, ... | Trained\n#> 5. • Removing rows with NA values in: ahead_7_death_rate | Trained\n#> 6. • # of recent observations per key limited to:: Inf | Trained\n```\n:::\n:::\n\n\n\nThe \"Inputs\" are the original `epi_df` and the \"roles\" that these are assigned.\nNone of these are predictors or outcomes. 
Those will be created \nby the recipe when it is prepped. The \"Operations\" are the sequence of \ninstructions to create the cake (baked training data).\nHere we create lagged predictors, lead the outcome, and then remove `NA`s.\nSome models like `lm` internally handle `NA`s, but not everything does, so we\ndeal with them explicitly. The code to do this (inside the forecaster) is\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-4_7b1a5f279cb0216eb98e81324ade71aa'}\n\n```{.r .cell-code}\ner <- epi_recipe(jhu) %>%\n step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>%\n step_epi_ahead(death_rate, ahead = 7) %>%\n step_epi_naomit()\n```\n:::\n\n\nWhile `{recipes}` provides a function `step_lag()`, it assumes that the data\nhave no breaks in the sequence of `time_values`. This is a bit dangerous, so\nwe avoid that behaviour. Our `lag/ahead` functions also appropriately adjust the\namount of data to avoid accidentally dropping recent predictors from the test\ndata.\n\n## The model specification\n\nUsers familiar with the `{parsnip}` package will have no trouble here.\nBasically, `{parsnip}` unifies the function signature across statistical models.\nFor example, `lm()` \"likes\" to work with formulas, but `glmnet::glmnet()` uses\n`x` and `y` for predictors and response. `{parsnip}` is agnostic. Both of these\ndo \"linear regression\". 
Above we switched from `lm()` to `xgboost()` without \nany issue despite the fact that these functions couldn't be more different.\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-5_904b72e53b7ebec817c2ff42657fface'}\n\n```{.r .cell-code}\nlm(\n formula, data, subset, weights, na.action,\n method = \"qr\",\n model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,\n contrasts = NULL, offset, ...\n)\n\nxgboost(\n data = NULL, label = NULL, missing = NA, weight = NULL,\n params = list(), nrounds, verbose = 1, print_every_n = 1L,\n early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,\n save_name = \"xgboost.model\", xgb_model = NULL, callbacks = list(),\n ...\n)\n```\n:::\n\n\n`{epipredict}` provides a few engines/modules like `flatline()` and \n`quantile_reg()` to power the `flatline_forecaster()` and provide quantile \nregression, but you should be able to use almost any available models\nlisted [here](https://www.tidymodels.org/find/parsnip/).\n\n\nTo estimate (fit) a preprocessed model, one calls `fit()` on the `epi_workflow`.\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-6_aa2a47d336b8de9c69394816f02e101e'}\n\n```{.r .cell-code}\newf <- epi_workflow(er, linear_reg()) %>% fit(jhu)\n```\n:::\n\n\n## Predicting and Postprocessing (bound together)\n\nTo stretch the metaphor of preparing a cake to its natural limits, we have\ncreated postprocessing functionality called \"frosting\". Much like the recipe,\neach postprocessing operation is a \"layer\" and we \"slather\" these onto our \nbaked cake. 
To fix ideas, below is the postprocessing `frosting` for \n`arx_forecaster()`\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-7_de6af11afc0b108fae8b915da6125069'}\n\n```{.r .cell-code}\nextract_frosting(out_gb$epi_workflow)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-8_f0ba4b4792cb1d279128e0e42125054f'}\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> ── Frosting ─────────────────────────────────────────────────────────────────\n#> \n#> ── Layers \n#> 1. Creating predictions: \"\"\n#> 2. Resampling residuals for predictive quantiles: \"\"\n#> quantile_levels 0.05, 0.95\n#> 3. Adding forecast date: \"2021-12-31\"\n#> 4. Adding target date: \"2022-01-07\"\n#> 5. Thresholding predictions: dplyr::starts_with(\".pred\") to [0, Inf)\n```\n:::\n:::\n\n\n\nHere we have 5 layers of frosting. The first generates the forecasts from the test data.\nThe second uses quantiles of the residuals to create distributional\nforecasts. The next two add columns for the date the forecast was made and the\ndate for which it is intended to occur. Because we are predicting rates, they \nshould be non-negative, so the last layer thresholds both predicted values and\nintervals at 0. 
The code to do this (inside the forecaster) is\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-9_20344dac94d93271078a00ddd04a0974'}\n\n```{.r .cell-code}\nf <- frosting() %>%\n layer_predict() %>%\n layer_residual_quantiles(\n quantile_levels = c(.01, .025, seq(.05, .95, by = .05), .975, .99),\n symmetrize = TRUE\n ) %>%\n layer_add_forecast_date() %>%\n layer_add_target_date() %>%\n layer_threshold(starts_with(\".pred\"))\n```\n:::\n\n\nAt predict time, we add this object onto the `epi_workflow` and call `predict()`\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-10_1af407921661ddcfded119d9726e1a59'}\n\n```{.r .cell-code}\ntest_data <- get_test_data(er, jhu)\newf %>%\n add_frosting(f) %>%\n predict(test_data)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 56 x 6 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-31 12:08:25\n#> \n#> # A tibble: 56 × 6\n#> geo_value time_value .pred .pred_distn forecast_date target_date\n#> * \n#> 1 ak 2021-12-31 0.355 quantiles(0.36)[23] 2021-12-31 2022-01-07 \n#> 2 al 2021-12-31 0.325 quantiles(0.32)[23] 2021-12-31 2022-01-07 \n#> 3 ar 2021-12-31 0.496 quantiles(0.5)[23] 2021-12-31 2022-01-07 \n#> 4 as 2021-12-31 0.0836 quantiles(0.08)[23] 2021-12-31 2022-01-07 \n#> 5 az 2021-12-31 0.614 quantiles(0.61)[23] 2021-12-31 2022-01-07 \n#> 6 ca 2021-12-31 0.327 quantiles(0.33)[23] 2021-12-31 2022-01-07 \n#> # ℹ 50 more rows\n```\n:::\n:::\n\n\nThe above `get_test_data()` function examines the recipe and ensures that enough\ntest data is available to create the necessary lags and produce a prediction\nfor the desired future time point (after the end of the training data). This mimics\nwhat would happen if `jhu` contained the most recent available historical data and\nwe wanted to actually predict the future. 
We could have instead used any test data\nthat contained the necessary predictors.\n\n:::{.callout-note}\nIn the predictions above, you'll see a `time_value` column. That's because we \ncould use **any training data**. We happened to use training data corresponding\nto the most recent available, and its lags. But we could have instead used\nlast week's or we could use the data that arrives next year, or we could use multiple\n`time_values` for multiple locations. This is completely allowed, though not\nnecessarily what you expect.\n\nIn production forecasting, you'd probably reestimate the model and produce new\npredictions whenever new data arrives. This is exactly what all the canned \nforecasters we provide do. So those strip out the `time_value` column.\n\nBut the next most likely procedure would be\nto feed your previously estimated model (without refitting) the new data.\nTo do this, you'd just call `get_test_data()` on that new data. And the \n`time_value` would still be the same as your `forecast_date`.\n\nGetting many forecasts (multiple `time_values`) for each location, is not\nexactly a typical desire in this context. But it's also not unheard of, so\nit is possible (and analogous to standard, non-time series forecasting). \n:::\n\n\n## Conclusion\n\nInternally, we provide some canned forecaster functions to create reasonable forecasts. \nBut ideally, a user could create their own forecasters by building up the \ncomponents we provide. In other chapters, we try to walk through some of these\ncustomizations. 
\n\nTo illustrate everything above, here is (roughly) the code for the \n`arx_forecaster()` to predict the death rate, 1 week ahead:\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-11_d45e3276831a1b40877b2251297fbb9d'}\n\n```{.r .cell-code}\nr <- epi_recipe(jhu) %>%\n step_epi_ahead(death_rate, ahead = 7) %>%\n step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>%\n step_epi_naomit()\n\nlatest <- get_test_data(r, jhu)\n\nf <- frosting() %>%\n layer_predict() %>%\n layer_residual_quantiles() %>%\n layer_add_forecast_date() %>%\n layer_add_target_date() %>%\n layer_threshold(starts_with(\".pred\"))\n\neng <- linear_reg()\nwf <- epi_workflow(r, eng, f) %>% fit(jhu)\npreds <- predict(wf, latest)\n```\n:::\n\nThe code for `arx_forecaster()` simply generalizes this, passing along arguments as needed.\n\n\n::: {.cell layout-align=\"center\" hash='forecast-framework_cache/html/unnamed-chunk-12_b7f75d610d9c4f0ced30040e9aa3a481'}\n\n```{.r .cell-code}\npreds\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 56 x 6 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-31 12:08:25\n#> \n#> # A tibble: 56 × 6\n#> geo_value time_value .pred .pred_distn forecast_date target_date\n#> * \n#> 1 ak 2021-12-31 0.355 quantiles(0.36)[2] 2021-12-31 2022-01-07 \n#> 2 al 2021-12-31 0.325 quantiles(0.32)[2] 2021-12-31 2022-01-07 \n#> 3 ar 2021-12-31 0.496 quantiles(0.5)[2] 2021-12-31 2022-01-07 \n#> 4 as 2021-12-31 0.0836 quantiles(0.2)[2] 2021-12-31 2022-01-07 \n#> 5 az 2021-12-31 0.614 quantiles(0.61)[2] 2021-12-31 2022-01-07 \n#> 6 ca 2021-12-31 0.327 quantiles(0.33)[2] 2021-12-31 2022-01-07 \n#> # ℹ 50 more rows\n```\n:::\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/growth-rates/execute-results/html.json b/_freeze/growth-rates/execute-results/html.json index 0b4c3b9..dffd019 100644 --- a/_freeze/growth-rates/execute-results/html.json +++ 
b/_freeze/growth-rates/execute-results/html.json @@ -1,7 +1,7 @@ { "hash": "c7a31477c0043e7cfa15a1c2b8f198ce", "result": { - "markdown": "---\ntitle: Estimate growth rates in signals\n---\n\n\nA basic way of assessing growth in a signal is to look at its relative change\nover two neighboring time windows. The `epiprocess` package provides a function\n`growth_rate()` to compute such relative changes, as well as more sophisticated \nestimates the growth rate of a signal. We investigate this functionality in the\ncurrent vignette, applied to state-level daily reported COVID-19 cases from GA\nand PA, smoothed using a 7-day trailing average.\n\n\n\n\n\nThe example we'll examine uses built in data containing confirmed COVID-19 cases and deaths based on reports made available by the Center for Systems Science and Engineering at Johns Hopkins University. This example data ranges from Mar 1, 2020 to Dec 31, 2021, and is limited to California, Florida, Texas, New York, Georgia, and Pennsylvania, though, we'll use only part of that here.\n\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-2_90212a671d4734416d16fd0084bb2126'}\n\n:::\n\n\n## Growth rate basics\n\nThe growth rate of a function $f$ defined over a continuously-valued parameter\n$t$ is defined as $f'(t)/f(t)$, where $f'(t)$ is the derivative of $f$ at\n$t$. To estimate the growth rate of a signal in discrete-time (which can be\nthought of as evaluations or discretizations of an underlying function in\ncontinuous-time), we can estimate the derivative and divide by the signal value\nitself (or possibly a smoothed version of the signal value). 
\n\nThe `growth_rate()` function takes a sequence of underlying design points `x`\nand corresponding sequence `y` of signal values, and allows us to choose from\nthe following methods for estimating the growth rate at a given reference point\n`x0`, by setting the `method` argument: \n\n* \"rel_change\": uses $(\\bar B/\\bar A - 1) / h$, where $\\bar B$ is the average of\n `y` over the second half of a sliding window of bandwidth `h` centered at the\n reference point `x0`, and $\\bar A$ the average over the first half. This can \n be seen as using a first-difference approximation to the derivative.\n* \"linear_reg\": uses the slope from a linear regression of `y` on `x` over a\n sliding window centered at the reference point `x0`, divided by the fitted\n value from this linear regression at `x0`.\n* \"smooth_spline\": uses the estimated derivative at `x0` from a smoothing spline\n fit to `x` and `y`, via `stats::smooth.spline()`, divided by the fitted value\n of the spline at `x0`.\n* \"trend_filter\": uses the estimated derivative at `x0` from polynomial trend\n filtering (a discrete spline) fit to `x` and `y`, via\n `genlasso::trendfilter()`, divided by the fitted value of the discrete spline\n at `x0`.\n\nThe default in `growth_rate()` is `x0 = x`, so that it returns an estimate of\nthe growth rate at each underlying design point. \n\n## Relative change\n\nThe default method is \"rel_change\", which is the simplest way to estimate growth\nrates. The default bandwidth is `h = 7`, which for daily data, considers the\nrelative change in a signal over adjacent weeks. 
We can wrap `growth_rate()` in\na call to `dplyr::mutate()` to append a new column to our `epi_df` object with\nthe computed growth rates.\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-3_c0f1a99e6a348f43892713c07255e1bd'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n mutate(cases_gr1 = growth_rate(time_value, cases))\n\nx\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 1,158 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-23 13:17:07.044666\n#> \n#> # A tibble: 1,158 × 4\n#> # Groups: geo_value [2]\n#> geo_value time_value cases cases_gr1\n#> * \n#> 1 ga 2020-06-01 643. 0.00601\n#> 2 ga 2020-06-02 603. 0.0185 \n#> 3 ga 2020-06-03 608 0.0240 \n#> 4 ga 2020-06-04 656. 0.0218 \n#> 5 ga 2020-06-05 677. 0.0193 \n#> 6 ga 2020-06-06 718. 0.0163 \n#> # ℹ 1,152 more rows\n```\n:::\n:::\n\n\nWe can visualize these growth rate estimates by plotting the signal values and\nhighlighting the periods in time for which the relative change is above 1% (in\nred) and below -1% (in blue), faceting by geo value. 
\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-4_8c243fea3b439cbd9f00e42ee6df1596'}\n\n```{.r .cell-code code-fold=\"true\"}\nupper <- 0.01\nlower <- -0.01\n\nggplot(x, aes(x = time_value, y = cases)) +\n geom_tile(\n data = x %>% filter(cases_gr1 >= upper),\n aes(x = time_value, y = 0, width = 7, height = Inf),\n fill = 2, alpha = 0.08\n ) +\n geom_tile(\n data = x %>% filter(cases_gr1 <= lower),\n aes(x = time_value, y = 0, width = 7, height = Inf),\n fill = 4, alpha = 0.08\n ) +\n geom_line() +\n facet_wrap(vars(geo_value), scales = \"free_y\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 cases\")\n```\n\n::: {.cell-output-display}\n![](growth-rates_files/figure-html/unnamed-chunk-4-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAs a more direct visualization, we plot the estimated growth rates themselves,\noverlaying the curves for the two states on one plot. 
\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-5_8afaf2ba22e39074da0d9093aad80bc6'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(x, aes(x = time_value, y = cases_gr1)) +\n geom_line(aes(col = geo_value)) +\n geom_hline(yintercept = upper, linetype = 2, col = 2) +\n geom_hline(yintercept = lower, linetype = 2, col = 4) +\n scale_color_manual(values = c(3, 6)) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Growth rate\", col = \"State\")\n```\n\n::: {.cell-output-display}\n![](growth-rates_files/figure-html/unnamed-chunk-5-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nWe can see that the estimated growth rates from the relative change method are\nsomewhat volatile, and there appears to be some bias towards towards the right\nboundary of the time span---look at the estimated growth rate for Georgia in\nlate December 2021, which takes a potentially suspicious dip. In general,\nestimation of derivatives will be difficult near the boundary, but relative\nchanges can suffer from particularly noticeable boundary bias because they are\nbased on a difference in averages over two halves of a local window, and with\nthis simplistic approach, one of these halves will be truncated near a boundary.\n\n## Linear regression\n\nThe second simplest method available is \"linear_reg\", whose default bandwidth is\nagain `h = 7`. 
Compared to \"rel_change\", it appears to behave similarly overall,\nbut thankfully avoids some of the troublesome spikes:\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-6_5f69ee2208e7ab381927baff14c5ee36'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n mutate(cases_gr2 = growth_rate(time_value, cases, method = \"linear_reg\"))\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-7_714913b51436803d947772a84bd89897'}\n\n```{.r .cell-code code-fold=\"true\"}\nx %>%\n pivot_longer(\n cols = starts_with(\"cases_gr\"),\n names_to = \"method\",\n values_to = \"gr\"\n ) %>%\n mutate(\n method = recode(method,\n cases_gr1 = \"rel_change\",\n cases_gr2 = \"linear_reg\"\n )\n ) %>%\n ggplot(aes(x = time_value, y = gr)) +\n geom_hline(yintercept = 0) +\n geom_line(aes(col = method)) +\n scale_color_manual(values = c(2, 4)) +\n facet_wrap(vars(geo_value), scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Growth rate\", col = \"Method\")\n```\n\n::: {.cell-output-display}\n![](growth-rates_files/figure-html/unnamed-chunk-7-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\n## Nonparametric estimation\n\nWe can also use a nonparametric method to estimate the derivative, through\n\"smooth_spline\" or \"trend_filter\". The latter is going to be generally more\ncomputationally expensive, but it is also able to adapt better to the local\nlevel of smoothness. (The apparent efficiency is actually compounded by the\nparticular implementations and default settings for these methods:\n\"trend_filter\" is based on a full solution path algorithm provided in the\n`genlasso` package, and performs cross-validation by default in order to pick\nthe level of regularization; read the documentation for `growth_rate()` more\ndetails.) 
\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-8_dba4540f8baa396296f1252aeeb08790'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n mutate(\n cases_gr3 = growth_rate(time_value, cases, method = \"smooth_spline\"),\n cases_gr4 = growth_rate(time_value, cases, method = \"trend_filter\")\n )\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-9_26f30e9ef6062725bda13985c480fdd7'}\n\n```{.r .cell-code code-fold=\"true\"}\nx %>%\n select(geo_value, time_value, cases_gr3, cases_gr4) %>%\n pivot_longer(\n cols = starts_with(\"cases_gr\"),\n names_to = \"method\",\n values_to = \"gr\"\n ) %>%\n mutate(method = recode(method,\n cases_gr3 = \"smooth_spline\",\n cases_gr4 = \"trend_filter\"\n )) %>%\n ggplot(aes(x = time_value, y = gr)) +\n geom_hline(yintercept = 0) +\n geom_line(aes(col = method)) +\n scale_color_manual(values = c(3, 6)) +\n facet_wrap(vars(geo_value), scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Growth rate\", col = \"Method\")\n```\n\n::: {.cell-output-display}\n![](growth-rates_files/figure-html/unnamed-chunk-9-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nIn this particular example, the trend filtering estimates of growth rate appear \nto be much more stable than those from the smoothing spline, and also much more \nstable than the estimates from local relative changes and linear regressions.\n\nThe smoothing spline growth rate estimates are based on the default settings in\n`stats::smooth.spline()`, and appear severely under-regularized here. 
Any of the\narguments to `stats::smooth.spline()` can be customized by passing them as\nadditional arguments `...` in the call to `growth_rate()`; similarly, we can\nalso use additional arguments to customize the settings in the underlying trend\nfiltering functions `genlasso::trendfilter()`, `genlasso::cv.trendfilter()`, and\nthe documentation for `growth_rate()` gives the full details.\n\n## Log scale estimation\n\nIn general, and alternative view for the growth rate of a function $f$ is given\nby defining $g(t) = \\log(f(t))$, and then observing that $g'(t) = f'(t)/f(t)$.\nTherefore, any method that estimates the derivative can be simply applied to the\nlog of the signal of interest, and in this light, each method above \n(\"rel_change\", \"linear_reg\", \"smooth_spline\", and \"trend_filter\") has a log\nscale analog, which can be used by setting the argument `log_scale = TRUE` in\nthe call to `growth_rate()`.\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-10_8010b6d42bffdd224e07ee7645ccaabd'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n mutate(\n cases_gr5 = growth_rate(time_value, cases,\n method = \"rel_change\",\n log_scale = TRUE\n ),\n cases_gr6 = growth_rate(time_value, cases,\n method = \"linear_reg\",\n log_scale = TRUE\n ),\n cases_gr7 = growth_rate(time_value, cases,\n method = \"smooth_spline\",\n log_scale = TRUE\n ),\n cases_gr8 = growth_rate(time_value, cases,\n method = \"trend_filter\",\n log_scale = TRUE\n )\n )\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-11_b6204de22436262c14a6fa081fa7e9c5'}\n\n```{.r .cell-code code-fold=\"true\"}\nx %>%\n select(geo_value, time_value, cases_gr5, cases_gr6) %>%\n pivot_longer(\n cols = starts_with(\"cases_gr\"),\n names_to = \"method\",\n values_to = \"gr\"\n ) %>%\n mutate(method = recode(method,\n cases_gr5 = \"rel_change_log\",\n cases_gr6 = \"linear_reg_log\"\n )) %>%\n ggplot(aes(x = time_value, y = 
gr)) +\n geom_line(aes(col = method)) +\n geom_hline(yintercept = 0) +\n scale_color_manual(values = c(2, 4)) +\n facet_wrap(vars(geo_value), scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Growth rate\", col = \"Method\")\n```\n\n::: {.cell-output-display}\n![](growth-rates_files/figure-html/unnamed-chunk-11-1.svg){fig-align='center' width=90%}\n:::\n\n```{.r .cell-code code-fold=\"true\"}\nx %>%\n select(geo_value, time_value, cases_gr7, cases_gr8) %>%\n pivot_longer(\n cols = starts_with(\"cases_gr\"),\n names_to = \"method\",\n values_to = \"gr\"\n ) %>%\n mutate(method = recode(method,\n cases_gr7 = \"smooth_spline_log\",\n cases_gr8 = \"trend_filter_log\"\n )) %>%\n ggplot(aes(x = time_value, y = gr)) +\n geom_hline(yintercept = 0) +\n geom_line(aes(col = method)) +\n scale_color_manual(values = c(3, 6)) +\n facet_wrap(vars(geo_value), scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Growth rate\", col = \"Method\")\n```\n\n::: {.cell-output-display}\n![](growth-rates_files/figure-html/unnamed-chunk-11-2.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nComparing the `rel_change_log` curves with their `rel_change` counterparts \n(shown in earlier figures), we see that the former curves appear less volatile \nand match the linear regression estimates much more closely. In particular, when\n`rel_change` has upward spikes, `rel_change_log` has less pronounced spikes.\nWhy does this occur? 
The estimate of $g'(t)$ here can be expressed as $\\mathbb\nE[\\log(B)-\\log(A)]/h = \\mathbb E[\\log(1+hR)]/h$, where $R = ((B-A)/h) / A$, and \nthe expectation refers to averaging over the $h$ observations in each window.\nConsider the following two relevant inequalities, both due to concavity of the\nlogarithm function: \n\n$$\n\\mathbb E[\\log(1+hR)]/h \\leq \\log(1+h\\mathbb E[R])/h \\leq \\mathbb E[R].\n$$\n\nThe first inequality is Jensen's; the second inequality is because the tangent \nline of a concave function lies above it. Finally, we observe that $\\mathbb\nE[R] \\approx ((\\bar B-\\bar A)/h) / \\bar A$, which the `rel_change` estimate. \nThis explains why the `rel_change_log` curve often lies below the `rel_change`\ncurve.\n", + "markdown": "---\ntitle: Estimate growth rates in signals\n---\n\n\nA basic way of assessing growth in a signal is to look at its relative change\nover two neighboring time windows. The `epiprocess` package provides a function\n`growth_rate()` to compute such relative changes, as well as more sophisticated \nestimates of the growth rate of a signal. We investigate this functionality in the\ncurrent vignette, applied to state-level daily reported COVID-19 cases from GA\nand PA, smoothed using a 7-day trailing average.\n\n\n\n\n\nThe example we'll examine uses built in data containing confirmed COVID-19 cases and deaths based on reports made available by the Center for Systems Science and Engineering at Johns Hopkins University. This example data ranges from Mar 1, 2020 to Dec 31, 2021, and is limited to California, Florida, Texas, New York, Georgia, and Pennsylvania, though, we'll use only part of that here.\n\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-2_90212a671d4734416d16fd0084bb2126'}\n\n:::\n\n\n## Growth rate basics\n\nThe growth rate of a function $f$ defined over a continuously-valued parameter\n$t$ is defined as $f'(t)/f(t)$, where $f'(t)$ is the derivative of $f$ at\n$t$.
To estimate the growth rate of a signal in discrete-time (which can be\nthought of as evaluations or discretizations of an underlying function in\ncontinuous-time), we can estimate the derivative and divide by the signal value\nitself (or possibly a smoothed version of the signal value). \n\nThe `growth_rate()` function takes a sequence of underlying design points `x`\nand corresponding sequence `y` of signal values, and allows us to choose from\nthe following methods for estimating the growth rate at a given reference point\n`x0`, by setting the `method` argument: \n\n* \"rel_change\": uses $(\\bar B/\\bar A - 1) / h$, where $\\bar B$ is the average of\n `y` over the second half of a sliding window of bandwidth `h` centered at the\n reference point `x0`, and $\\bar A$ the average over the first half. This can \n be seen as using a first-difference approximation to the derivative.\n* \"linear_reg\": uses the slope from a linear regression of `y` on `x` over a\n sliding window centered at the reference point `x0`, divided by the fitted\n value from this linear regression at `x0`.\n* \"smooth_spline\": uses the estimated derivative at `x0` from a smoothing spline\n fit to `x` and `y`, via `stats::smooth.spline()`, divided by the fitted value\n of the spline at `x0`.\n* \"trend_filter\": uses the estimated derivative at `x0` from polynomial trend\n filtering (a discrete spline) fit to `x` and `y`, via\n `genlasso::trendfilter()`, divided by the fitted value of the discrete spline\n at `x0`.\n\nThe default in `growth_rate()` is `x0 = x`, so that it returns an estimate of\nthe growth rate at each underlying design point. \n\n## Relative change\n\nThe default method is \"rel_change\", which is the simplest way to estimate growth\nrates. The default bandwidth is `h = 7`, which for daily data, considers the\nrelative change in a signal over adjacent weeks. 
We can wrap `growth_rate()` in\na call to `dplyr::mutate()` to append a new column to our `epi_df` object with\nthe computed growth rates.\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-3_c0f1a99e6a348f43892713c07255e1bd'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n mutate(cases_gr1 = growth_rate(time_value, cases))\n\nx\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 1,158 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-23 13:17:07\n#> \n#> # A tibble: 1,158 × 4\n#> # Groups: geo_value [2]\n#> geo_value time_value cases cases_gr1\n#> * \n#> 1 ga 2020-06-01 643. 0.00601\n#> 2 ga 2020-06-02 603. 0.0185 \n#> 3 ga 2020-06-03 608 0.0240 \n#> 4 ga 2020-06-04 656. 0.0218 \n#> 5 ga 2020-06-05 677. 0.0193 \n#> 6 ga 2020-06-06 718. 0.0163 \n#> # ℹ 1,152 more rows\n```\n:::\n:::\n\n\nWe can visualize these growth rate estimates by plotting the signal values and\nhighlighting the periods in time for which the relative change is above 1% (in\nred) and below -1% (in blue), faceting by geo value. 
\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-4_8c243fea3b439cbd9f00e42ee6df1596'}\n\n```{.r .cell-code code-fold=\"true\"}\nupper <- 0.01\nlower <- -0.01\n\nggplot(x, aes(x = time_value, y = cases)) +\n geom_tile(\n data = x %>% filter(cases_gr1 >= upper),\n aes(x = time_value, y = 0, width = 7, height = Inf),\n fill = 2, alpha = 0.08\n ) +\n geom_tile(\n data = x %>% filter(cases_gr1 <= lower),\n aes(x = time_value, y = 0, width = 7, height = Inf),\n fill = 4, alpha = 0.08\n ) +\n geom_line() +\n facet_wrap(vars(geo_value), scales = \"free_y\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 cases\")\n```\n\n::: {.cell-output-display}\n![](growth-rates_files/figure-html/unnamed-chunk-4-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAs a more direct visualization, we plot the estimated growth rates themselves,\noverlaying the curves for the two states on one plot. 
\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-5_8afaf2ba22e39074da0d9093aad80bc6'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(x, aes(x = time_value, y = cases_gr1)) +\n geom_line(aes(col = geo_value)) +\n geom_hline(yintercept = upper, linetype = 2, col = 2) +\n geom_hline(yintercept = lower, linetype = 2, col = 4) +\n scale_color_manual(values = c(3, 6)) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Growth rate\", col = \"State\")\n```\n\n::: {.cell-output-display}\n![](growth-rates_files/figure-html/unnamed-chunk-5-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nWe can see that the estimated growth rates from the relative change method are\nsomewhat volatile, and there appears to be some bias towards the right\nboundary of the time span---look at the estimated growth rate for Georgia in\nlate December 2021, which takes a potentially suspicious dip. In general,\nestimation of derivatives will be difficult near the boundary, but relative\nchanges can suffer from particularly noticeable boundary bias because they are\nbased on a difference in averages over two halves of a local window, and with\nthis simplistic approach, one of these halves will be truncated near a boundary.\n\n## Linear regression\n\nThe second simplest method available is \"linear_reg\", whose default bandwidth is\nagain `h = 7`.
Compared to \"rel_change\", it appears to behave similarly overall,\nbut thankfully avoids some of the troublesome spikes:\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-6_5f69ee2208e7ab381927baff14c5ee36'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n mutate(cases_gr2 = growth_rate(time_value, cases, method = \"linear_reg\"))\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-7_714913b51436803d947772a84bd89897'}\n\n```{.r .cell-code code-fold=\"true\"}\nx %>%\n pivot_longer(\n cols = starts_with(\"cases_gr\"),\n names_to = \"method\",\n values_to = \"gr\"\n ) %>%\n mutate(\n method = recode(method,\n cases_gr1 = \"rel_change\",\n cases_gr2 = \"linear_reg\"\n )\n ) %>%\n ggplot(aes(x = time_value, y = gr)) +\n geom_hline(yintercept = 0) +\n geom_line(aes(col = method)) +\n scale_color_manual(values = c(2, 4)) +\n facet_wrap(vars(geo_value), scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Growth rate\", col = \"Method\")\n```\n\n::: {.cell-output-display}\n![](growth-rates_files/figure-html/unnamed-chunk-7-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\n## Nonparametric estimation\n\nWe can also use a nonparametric method to estimate the derivative, through\n\"smooth_spline\" or \"trend_filter\". The latter is going to be generally more\ncomputationally expensive, but it is also able to adapt better to the local\nlevel of smoothness. (The apparent efficiency is actually compounded by the\nparticular implementations and default settings for these methods:\n\"trend_filter\" is based on a full solution path algorithm provided in the\n`genlasso` package, and performs cross-validation by default in order to pick\nthe level of regularization; read the documentation for `growth_rate()` for more\ndetails.)
\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-8_dba4540f8baa396296f1252aeeb08790'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n mutate(\n cases_gr3 = growth_rate(time_value, cases, method = \"smooth_spline\"),\n cases_gr4 = growth_rate(time_value, cases, method = \"trend_filter\")\n )\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-9_26f30e9ef6062725bda13985c480fdd7'}\n\n```{.r .cell-code code-fold=\"true\"}\nx %>%\n select(geo_value, time_value, cases_gr3, cases_gr4) %>%\n pivot_longer(\n cols = starts_with(\"cases_gr\"),\n names_to = \"method\",\n values_to = \"gr\"\n ) %>%\n mutate(method = recode(method,\n cases_gr3 = \"smooth_spline\",\n cases_gr4 = \"trend_filter\"\n )) %>%\n ggplot(aes(x = time_value, y = gr)) +\n geom_hline(yintercept = 0) +\n geom_line(aes(col = method)) +\n scale_color_manual(values = c(3, 6)) +\n facet_wrap(vars(geo_value), scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Growth rate\", col = \"Method\")\n```\n\n::: {.cell-output-display}\n![](growth-rates_files/figure-html/unnamed-chunk-9-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nIn this particular example, the trend filtering estimates of growth rate appear \nto be much more stable than those from the smoothing spline, and also much more \nstable than the estimates from local relative changes and linear regressions.\n\nThe smoothing spline growth rate estimates are based on the default settings in\n`stats::smooth.spline()`, and appear severely under-regularized here. 
Any of the\narguments to `stats::smooth.spline()` can be customized by passing them as\nadditional arguments `...` in the call to `growth_rate()`; similarly, we can\nalso use additional arguments to customize the settings in the underlying trend\nfiltering functions `genlasso::trendfilter()`, `genlasso::cv.trendfilter()`, and\nthe documentation for `growth_rate()` gives the full details.\n\n## Log scale estimation\n\nIn general, an alternative view for the growth rate of a function $f$ is given\nby defining $g(t) = \\log(f(t))$, and then observing that $g'(t) = f'(t)/f(t)$.\nTherefore, any method that estimates the derivative can be simply applied to the\nlog of the signal of interest, and in this light, each method above \n(\"rel_change\", \"linear_reg\", \"smooth_spline\", and \"trend_filter\") has a log\nscale analog, which can be used by setting the argument `log_scale = TRUE` in\nthe call to `growth_rate()`.\n\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-10_8010b6d42bffdd224e07ee7645ccaabd'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n mutate(\n cases_gr5 = growth_rate(time_value, cases,\n method = \"rel_change\",\n log_scale = TRUE\n ),\n cases_gr6 = growth_rate(time_value, cases,\n method = \"linear_reg\",\n log_scale = TRUE\n ),\n cases_gr7 = growth_rate(time_value, cases,\n method = \"smooth_spline\",\n log_scale = TRUE\n ),\n cases_gr8 = growth_rate(time_value, cases,\n method = \"trend_filter\",\n log_scale = TRUE\n )\n )\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='growth-rates_cache/html/unnamed-chunk-11_b6204de22436262c14a6fa081fa7e9c5'}\n\n```{.r .cell-code code-fold=\"true\"}\nx %>%\n select(geo_value, time_value, cases_gr5, cases_gr6) %>%\n pivot_longer(\n cols = starts_with(\"cases_gr\"),\n names_to = \"method\",\n values_to = \"gr\"\n ) %>%\n mutate(method = recode(method,\n cases_gr5 = \"rel_change_log\",\n cases_gr6 = \"linear_reg_log\"\n )) %>%\n ggplot(aes(x = time_value, y =
gr)) +\n geom_line(aes(col = method)) +\n geom_hline(yintercept = 0) +\n scale_color_manual(values = c(2, 4)) +\n facet_wrap(vars(geo_value), scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Growth rate\", col = \"Method\")\n```\n\n::: {.cell-output-display}\n![](growth-rates_files/figure-html/unnamed-chunk-11-1.svg){fig-align='center' width=90%}\n:::\n\n```{.r .cell-code code-fold=\"true\"}\nx %>%\n select(geo_value, time_value, cases_gr7, cases_gr8) %>%\n pivot_longer(\n cols = starts_with(\"cases_gr\"),\n names_to = \"method\",\n values_to = \"gr\"\n ) %>%\n mutate(method = recode(method,\n cases_gr7 = \"smooth_spline_log\",\n cases_gr8 = \"trend_filter_log\"\n )) %>%\n ggplot(aes(x = time_value, y = gr)) +\n geom_hline(yintercept = 0) +\n geom_line(aes(col = method)) +\n scale_color_manual(values = c(3, 6)) +\n facet_wrap(vars(geo_value), scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Growth rate\", col = \"Method\")\n```\n\n::: {.cell-output-display}\n![](growth-rates_files/figure-html/unnamed-chunk-11-2.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nComparing the `rel_change_log` curves with their `rel_change` counterparts \n(shown in earlier figures), we see that the former curves appear less volatile \nand match the linear regression estimates much more closely. In particular, when\n`rel_change` has upward spikes, `rel_change_log` has less pronounced spikes.\nWhy does this occur? 
The estimate of $g'(t)$ here can be expressed as $\\mathbb\nE[\\log(B)-\\log(A)]/h = \\mathbb E[\\log(1+hR)]/h$, where $R = ((B-A)/h) / A$, and \nthe expectation refers to averaging over the $h$ observations in each window.\nConsider the following two relevant inequalities, both due to concavity of the\nlogarithm function: \n\n$$\n\\mathbb E[\\log(1+hR)]/h \\leq \\log(1+h\\mathbb E[R])/h \\leq \\mathbb E[R].\n$$\n\nThe first inequality is Jensen's; the second inequality is because the tangent \nline of a concave function lies above it. Finally, we observe that $\\mathbb\nE[R] \\approx ((\\bar B-\\bar A)/h) / \\bar A$, which is the `rel_change` estimate. \nThis explains why the `rel_change_log` curve often lies below the `rel_change`\ncurve.\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/index/execute-results/html.json b/_freeze/index/execute-results/html.json index 5d7b35a..4cc6c85 100644 --- a/_freeze/index/execute-results/html.json +++ b/_freeze/index/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "b63e0142e33c157a71698ac19debb89a", + "hash": "581a82e7d09ad8a4454b6d07d65c6e0c", "result": { - "markdown": "---\ntoc-depth: 2\nnocite: |\n @*\n---\n\n\n# Preface {.unnumbered}\n\n\n::: {.cell}\n\n:::\n\n\n::: {.callout-caution}\nThis book is still under construction and may not yet be fully self-contained or reproducible. But it hopefully will be!\n:::\n\nThis book describes some of the functionality of the\n`{epiprocess}` and `{epipredict}` R packages, with an eye toward creating various types of signal processing and forecast creation for epidemiological data. The goal is to be able to load, inspect, process, and forecast\n --- using simple baselines to more elaborate customizations.
\n\n## Installation {#sec-installation}\n\n\n\nThe following commands install the latest versions of the packages we use in this book:\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-2_ef21555fe232decfa5c5e6ce81cfc532'}\n\n```{.r .cell-code}\n# install.packages(\"pak\")\n\n# Install our packages from GitHub:\npak::pkg_install(\"cmu-delphi/epidatr\")\npak::pkg_install(\"cmu-delphi/epiprocess\")\npak::pkg_install(\"cmu-delphi/epipredict\")\npak::pkg_install(\"cmu-delphi/epidatasets\")\n# Other model-fitting packages we use in this book (via epipredict):\npak::pkg_install(\"poissonreg\")\npak::pkg_install(\"ranger\")\npak::pkg_install(\"xgboost\")\n# Other data processing, model evaluation, example data, and other packages we\n# use in this book:\npak::pkg_install(\"RcppRoll\")\npak::pkg_install(\"tidyverse\")\npak::pkg_install(\"tidymodels\")\npak::pkg_install(\"broom\")\npak::pkg_install(\"performance\")\npak::pkg_install(\"modeldata\")\npak::pkg_install(\"see\")\npak::pkg_install(\"sessioninfo\")\n```\n:::\n\n\nMuch of the data used for illustration can be loaded directly from [Delphi's Epidata API](https://cmu-delphi.github.io/delphi-epidata/) which is built and maintained by the Carnegie Mellon University [Delphi research group](https://delphi.cmu.edu/). We have tried to provide most of the data used in these examples in a separate package, `{epidatasets}`, but it can also be accessed using `{epidatr}`, an R interface to the API and the successor to [`{covidcast}`](https://cmu-delphi.github.io/covidcast/covidcastR/). These are also available from GitHub:\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-3_6a7154f9225b9bd3d6ffa321b2da25af'}\n\n```{.r .cell-code}\npak::pkg_install(\"cmu-delphi/epidatasets\")\npak::pkg_install(\"cmu-delphi/epidatr\")\n```\n:::\n\n\n\n
Encountering installation issues? Click here to show some potential solutions. \n\n### Linux installation issues: compilation errors or slowness\n\nIf you are using Linux and encounter any compilation errors above, or if\ncompilation is taking very long, you might try using the RStudio (now called\nPosit) Package Manager to install binaries. You can try running this command\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-4_1c0021dec1f71a9fdbdd1a577b67f72a'}\n\n```{.r .cell-code}\noptions(\n repos = c(\n # contains binaries for Linux:\n RSPM = \"https://packagemanager.rstudio.com/all/latest\",\n # backup CRAN mirror of your choice:\n CRAN = \"https://cran.rstudio.com/\"\n )\n)\n```\n:::\n\n\n### Reproducibility\n\nThe above commands will give you the current versions of the packages used in\nthis book. If you're having trouble reproducing some of the results, it may be\ndue to package updates that took place after the book was last updated. To match\nthe versions we used to generate this book, you can use the steps below.\n\n#### First: set up and store a GitHub PAT\n\nIf you don't already have a GitHub PAT, you can use the following helper functions to create one:\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-5_8e6231bc239928b163de90b8ac90ad95'}\n\n```{.r .cell-code}\n# Run this once:\ninstall.packages(\"usethis\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Installing usethis [2.2.0] ...\n#> \tOK [linked cache in 0.22 milliseconds]\n#> * Installed 1 package in 3.1 seconds.\n```\n:::\n\n```{.r .cell-code}\nusethis::create_github_token(\n scopes = \"public_repo\",\n description = \"For public repo access\"\n)\n```\n:::\n\nThis will open a web browser window allowing you to describe and customize\nsettings of the PAT. Scroll to the bottom and click \"Generate\ntoken\". 
You'll see a screen that has `ghp_` with a green background; you can click the two-squares (\"copy\") icon to copy this `ghp_......` string to the clipboard.\n\n#### Either A: Download and use the `renv.lock`\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-6_298fc2b073cbedc2c6a201948f33aed0'}\n\n```{.r .cell-code}\n# Run this once:\ninstall.packages(c(\"renv\", \"gitcreds\"))\ndownload.file(\"https://raw.githubusercontent.com/cmu-delphi/delphi-tooling-book/main/renv.lock\", \"delphi-tooling-book.renv.lock\")\n\n# Run this in a fresh session each time you'd like to use this set of versions.\n# Warning: don't save your GitHub PAT in a file you might share with others;\n# look into `gitcreds::gitcreds_set()` or `usethis::edit_r_environ()` instead.\nSys.setenv(\"GITHUB_PAT\" = \"ghp_............\")\nrenv::use(lockfile = \"delphi-tooling-book.renv.lock\")\n# If you get 401 errors, you may need to regenerate your GitHub PAT or check if\n# `gitcreds::gitcreds_get()` is detecting an old PAT you have saved somewhere.\n```\n:::\n\n\n#### Or B: Download the book and use its `.Rprofile`\n\n1. Download the book [here](https://github.com/cmu-delphi/delphi-tooling-book/archive/refs/heads/main.zip) and unzip it.\n2. One-time setup: launch R inside the delphi-tooling-book directory (to use its\n `.Rprofile` file) and run\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-7_5f193d921c8c0c99eb8e67696fb70a8b'}\n\n```{.r .cell-code}\n# Warning: don't save your GitHub PAT in a file you might share with others;\n# look into `gitcreds::gitcreds_set()` or `usethis::edit_r_environ()` instead.\nSys.setenv(\"GITHUB_PAT\" = \"ghp_............\")\nrenv::restore() # downloads the appropriate package versions\n```\n:::\n\n\n3. To use this set of versions: launch R inside the delphi-tooling-book directory.\n\n### Other issues\n\nPlease let us know! 
You can file an issue with the book [here](https://github.com/cmu-delphi/delphi-tooling-book/issues), or with one of the individual packages at their own issue pages: [epidatr](https://github.com/cmu-delphi/epidatr/issues), [epiprocess](https://github.com/cmu-delphi/epiprocess/issues), [epipredict](https://github.com/cmu-delphi/epipredict/issues).\n\n
\n\n\n## Documentation\n\nYou can view the complete documentation for these packages at \n\n* ,\n* ,\n* ,\n* .\n\n## Attribution\n\nThis document contains a number of datasets that are a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). These data are licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.\n\n[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): \n These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. 
\n\n\n\n## Quick-start example\n\nThese packages come with some built-in historical data for illustration, but\nup-to-date versions could be downloaded with the\n[`{epidatr}`](https://cmu-delphi.github.io/epidatr) or \n[`{covidcast}`](https://cmu-delphi.github.io/covidcast/covidcastR/index.html) \npackages and processed using\n[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/).[^index1]\n\n[^index1]: COVIDcast data and other epidemiological signals for non-Covid related illnesses are available with [`{epidatr}`](https://cmu-delphi.github.io/epidatr), which interfaces directly to Delphi's [Epidata API](https://cmu-delphi.github.io/delphi-epidata/).\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/epidf_48a293db285163cbde5b55b5d6115276'}\n\n```{.r .cell-code}\nlibrary(epipredict)\njhu <- case_death_rate_subset\njhu\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 20,496 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-31 12:08:25.791826\n#> \n#> # A tibble: 20,496 × 4\n#> geo_value time_value case_rate death_rate\n#> * \n#> 1 ak 2020-12-31 35.9 0.158\n#> 2 al 2020-12-31 65.1 0.438\n#> 3 ar 2020-12-31 66.0 1.27 \n#> 4 as 2020-12-31 0 0 \n#> 5 az 2020-12-31 76.8 1.10 \n#> 6 ca 2020-12-31 96.0 0.751\n#> # ℹ 20,490 more rows\n```\n:::\n:::\n\n\nTo create and train a simple auto-regressive forecaster to predict the death rate two weeks into the future using past (lagged) deaths and cases, we could use the following function.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/make-forecasts_1d4f30fdea1cd893cb39553fa0f1d21c'}\n\n```{.r .cell-code}\ntwo_week_ahead <- arx_forecaster(\n jhu,\n outcome = \"death_rate\",\n predictors = c(\"case_rate\", \"death_rate\"),\n args_list = arx_args_list(\n lags = list(case_rate = c(0, 1, 2, 3, 7, 14), death_rate = c(0, 7, 14)),\n ahead = 14\n )\n)\n```\n:::\n\n\nIn this case, we have used a number of different lags for the case rate, while 
only using 3 weekly lags for the death rate (as predictors). The result is both a fitted model object which could be used any time in the future to create different forecasts, as well as a set of predicted values (and prediction intervals) for each location 14 days after the last available time value in the data.\n\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/print-model_f5f6b4212b46903845381e0a40889efc'}\n\n```{.r .cell-code}\ntwo_week_ahead$epi_workflow\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Epi Workflow [trained] ═══════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> Postprocessor: Frosting\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 6 Recipe Steps\n#> \n#> • step_epi_lag()\n#> • step_epi_lag()\n#> • step_epi_ahead()\n#> • step_naomit()\n#> • step_naomit()\n#> • step_training_window()\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> \n#> Call:\n#> stats::lm(formula = ..y ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) lag_0_case_rate lag_1_case_rate lag_2_case_rate \n#> -0.0073358 0.0030365 0.0012467 0.0009536 \n#> lag_3_case_rate lag_7_case_rate lag_14_case_rate lag_0_death_rate \n#> 0.0011425 0.0012481 0.0003041 0.1351769 \n#> lag_7_death_rate lag_14_death_rate \n#> 0.1471127 0.1062473 \n#> \n#> ── Postprocessor ────────────────────────────────────────────────────────────\n#> 5 Frosting Layers\n#> \n#> • layer_predict()\n#> • layer_residual_quantiles()\n#> • layer_add_forecast_date()\n#> • layer_add_target_date()\n#> • layer_threshold()\n```\n:::\n:::\n\n\nThe fitted model here involved preprocessing the data to appropriately generate lagged predictors, estimating a linear model with `stats::lm()` and then postprocessing the results to be meaningful for epidemiological tasks. 
We can also examine the predictions.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/show-preds_4bf0ca6ef427c01aa0a0686ab430f93d'}\n\n```{.r .cell-code}\ntwo_week_ahead$predictions\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 56 × 5\n#> geo_value .pred .pred_distn forecast_date target_date\n#> \n#> 1 ak 0.449 [0.05, 0.95] 2021-12-31 2022-01-14 \n#> 2 al 0.574 [0.05, 0.95] 2021-12-31 2022-01-14 \n#> 3 ar 0.673 [0.05, 0.95] 2021-12-31 2022-01-14 \n#> 4 as 0 [0.05, 0.95] 2021-12-31 2022-01-14 \n#> 5 az 0.679 [0.05, 0.95] 2021-12-31 2022-01-14 \n#> 6 ca 0.575 [0.05, 0.95] 2021-12-31 2022-01-14 \n#> # ℹ 50 more rows\n```\n:::\n:::\n\n\nThe results above show a distributional forecast produced using data through the end of 2021 for the 14th of January 2022. A prediction for the death rate per 100K inhabitants is available for every state (`geo_value`) along with a 90% predictive interval. The figure below\ndisplays the forecast for a small handful of states. The vertical black line is the forecast date. 
The forecast doesn't appear to be particularly good, but our choices above were intended to be illustrative of the functionality rather than optimized for accuracy.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-8_255707fa6f6471ab8acdc4aa70ca731f'}\n\n```{.r .cell-code code-fold=\"true\"}\nsamp_geos <- c(\"ca\", \"co\", \"ny\", \"pa\")\n\nhist <- jhu %>%\n filter(\n geo_value %in% samp_geos,\n time_value >= max(time_value) - 90L\n )\npreds <- two_week_ahead$predictions %>%\n filter(geo_value %in% samp_geos) %>%\n mutate(q = nested_quantiles(.pred_distn)) %>%\n unnest(q) %>%\n pivot_wider(names_from = tau, values_from = q)\n\nggplot(hist, aes(color = geo_value)) +\n geom_line(aes(time_value, death_rate)) +\n theme_bw() +\n geom_errorbar(data = preds, aes(x = target_date, ymin = `0.05`, ymax = `0.95`)) +\n geom_point(data = preds, aes(target_date, .pred)) +\n geom_vline(data = preds, aes(xintercept = forecast_date)) +\n scale_colour_viridis_d(name = \"\") +\n scale_x_date(date_labels = \"%b %Y\") +\n theme(legend.position = \"bottom\") +\n labs(x = \"\", y = \"Incident deaths per 100K\\n inhabitants\")\n```\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-8-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\n## Contents\n\nThe remainder of this book examines this software in more detail, illustrating some of the flexibility that is available.\n\n---\n\n
Session Information. \n\nSee also @sec-installation.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-9_c866f0d1d0a1809a33be44cd8b8eec3f'}\n\n```{.r .cell-code}\nsessioninfo::session_info()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ─ Session info ────────────────────────────────────────────────────────────\n#> setting value\n#> version R version 4.3.0 (2023-04-21)\n#> os macOS Ventura 13.4\n#> system aarch64, darwin20\n#> ui X11\n#> language (EN)\n#> collate en_US.UTF-8\n#> ctype en_US.UTF-8\n#> tz America/Vancouver\n#> date 2023-06-21\n#> pandoc 3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)\n#> \n#> ─ Packages ────────────────────────────────────────────────────────────────\n#> ! package * version date (UTC) lib source\n#> P anytime 0.3.9 2020-08-27 [?] CRAN (R 4.3.0)\n#> P backports 1.4.1 2021-12-13 [?] CRAN (R 4.3.0)\n#> P checkmate 2.2.0 2023-04-27 [?] CRAN (R 4.3.0)\n#> P class 7.3-22 2023-05-03 [?] CRAN (R 4.3.0)\n#> P cli 3.6.1 2023-03-23 [?] CRAN (R 4.3.0)\n#> P codetools 0.2-19 2023-02-01 [?] CRAN (R 4.3.0)\n#> P colorspace 2.1-0 2023-01-23 [?] CRAN (R 4.3.0)\n#> P crayon 1.5.2 2022-09-29 [?] CRAN (R 4.3.0)\n#> P data.table 1.14.8 2023-02-17 [?] CRAN (R 4.3.0)\n#> P digest 0.6.31 2022-12-11 [?] CRAN (R 4.3.0)\n#> P distributional 0.3.2 2023-03-22 [?] CRAN (R 4.3.0)\n#> P dplyr * 1.1.2 2023-04-20 [?] CRAN (R 4.3.0)\n#> P ellipsis 0.3.2 2021-04-29 [?] CRAN (R 4.3.0)\n#> P epidatasets * 0.0.1 2023-06-17 [?] Github (cmu-delphi/epidatasets@cc8f2a0)\n#> P epidatr * 0.6.0 2023-06-16 [?] Github (cmu-delphi/epidatr@46d2d54)\n#> P epipredict * 0.0.5 2023-06-17 [?] Github (cmu-delphi/epipredict@206f0ef)\n#> P epiprocess * 0.6.0.9999 2023-06-16 [?] Github (cmu-delphi/epiprocess@572f6e6)\n#> P evaluate 0.21 2023-05-05 [?] CRAN (R 4.3.0)\n#> P fansi 1.0.4 2023-01-22 [?] CRAN (R 4.3.0)\n#> P farver 2.1.1 2022-07-06 [?] CRAN (R 4.3.0)\n#> P fastmap 1.1.1 2023-02-24 [?] 
CRAN (R 4.3.0)\n#> P forcats * 1.0.0 2023-01-29 [?] CRAN (R 4.3.0)\n#> P fs 1.6.2 2023-04-25 [?] CRAN (R 4.3.0)\n#> P future 1.32.0 2023-03-07 [?] CRAN (R 4.3.0)\n#> P future.apply 1.11.0 2023-05-21 [?] CRAN (R 4.3.0)\n#> P generics 0.1.3 2022-07-05 [?] CRAN (R 4.3.0)\n#> P ggplot2 * 3.4.2 2023-04-03 [?] CRAN (R 4.3.0)\n#> P globals 0.16.2 2022-11-21 [?] CRAN (R 4.3.0)\n#> P glue 1.6.2 2022-02-24 [?] CRAN (R 4.3.0)\n#> P gower 1.0.1 2022-12-22 [?] CRAN (R 4.3.0)\n#> P gtable 0.3.3 2023-03-21 [?] CRAN (R 4.3.0)\n#> P hardhat 1.3.0 2023-03-30 [?] CRAN (R 4.3.0)\n#> P hms 1.1.3 2023-03-21 [?] CRAN (R 4.3.0)\n#> P htmltools 0.5.5 2023-03-23 [?] CRAN (R 4.3.0)\n#> P httr 1.4.6 2023-05-08 [?] CRAN (R 4.3.0)\n#> P ipred 0.9-14 2023-03-09 [?] CRAN (R 4.3.0)\n#> P jsonlite 1.8.5 2023-06-05 [?] CRAN (R 4.3.0)\n#> P knitr 1.43 2023-05-25 [?] CRAN (R 4.3.0)\n#> P labeling 0.4.2 2020-10-20 [?] CRAN (R 4.3.0)\n#> P lattice 0.21-8 2023-04-05 [?] CRAN (R 4.3.0)\n#> P lava 1.7.2.1 2023-02-27 [?] CRAN (R 4.3.0)\n#> P lifecycle 1.0.3 2022-10-07 [?] CRAN (R 4.3.0)\n#> P listenv 0.9.0 2022-12-16 [?] CRAN (R 4.3.0)\n#> P lubridate * 1.9.2 2023-02-10 [?] CRAN (R 4.3.0)\n#> P magrittr 2.0.3 2022-03-30 [?] CRAN (R 4.3.0)\n#> P MASS 7.3-60 2023-05-04 [?] CRAN (R 4.3.0)\n#> P Matrix 1.5-4 2023-04-04 [?] CRAN (R 4.3.0)\n#> P MatrixModels 0.5-1 2022-09-11 [?] CRAN (R 4.3.0)\n#> P MMWRweek 0.1.3 2020-04-22 [?] CRAN (R 4.3.0)\n#> P munsell 0.5.0 2018-06-12 [?] CRAN (R 4.3.0)\n#> P nnet 7.3-19 2023-05-03 [?] CRAN (R 4.3.0)\n#> P parallelly 1.36.0 2023-05-26 [?] CRAN (R 4.3.0)\n#> P parsnip * 1.1.0 2023-04-12 [?] CRAN (R 4.3.0)\n#> P pillar 1.9.0 2023-03-22 [?] CRAN (R 4.3.0)\n#> P pkgconfig 2.0.3 2019-09-22 [?] CRAN (R 4.3.0)\n#> P prodlim 2023.03.31 2023-04-02 [?] CRAN (R 4.3.0)\n#> P purrr * 1.0.1 2023-01-10 [?] CRAN (R 4.3.0)\n#> P quantreg 5.95 2023-04-08 [?] CRAN (R 4.3.0)\n#> P R.cache 0.16.0 2022-07-21 [?] CRAN (R 4.3.0)\n#> P R.methodsS3 1.8.2 2022-06-13 [?] 
CRAN (R 4.3.0)\n#> P R.oo 1.25.0 2022-06-12 [?] CRAN (R 4.3.0)\n#> P R.utils 2.12.2 2022-11-11 [?] CRAN (R 4.3.0)\n#> P R6 2.5.1 2021-08-19 [?] CRAN (R 4.3.0)\n#> P Rcpp 1.0.10 2023-01-22 [?] CRAN (R 4.3.0)\n#> P readr * 2.1.4 2023-02-10 [?] CRAN (R 4.3.0)\n#> P recipes 1.0.6 2023-04-25 [?] CRAN (R 4.3.0)\n#> renv 0.17.3 2023-04-06 [1] CRAN (R 4.2.2)\n#> P rlang 1.1.1 2023-04-28 [?] CRAN (R 4.3.0)\n#> P rmarkdown 2.22 2023-06-01 [?] CRAN (R 4.3.0)\n#> P rpart 4.1.19 2022-10-21 [?] CRAN (R 4.3.0)\n#> P rstudioapi 0.14 2022-08-22 [?] CRAN (R 4.3.0)\n#> P scales 1.2.1 2022-08-20 [?] CRAN (R 4.3.0)\n#> P sessioninfo 1.2.2 2021-12-06 [?] CRAN (R 4.3.0)\n#> P SparseM 1.81 2021-02-18 [?] CRAN (R 4.3.0)\n#> P stringi 1.7.12 2023-01-11 [?] CRAN (R 4.3.0)\n#> P stringr * 1.5.0 2022-12-02 [?] CRAN (R 4.3.0)\n#> P styler 1.10.1 2023-06-05 [?] CRAN (R 4.3.0)\n#> P survival 3.5-5 2023-03-12 [?] CRAN (R 4.3.0)\n#> P tibble * 3.2.1 2023-03-20 [?] CRAN (R 4.3.0)\n#> P tidyr * 1.3.0 2023-01-24 [?] CRAN (R 4.3.0)\n#> P tidyselect 1.2.0 2022-10-10 [?] CRAN (R 4.3.0)\n#> P tidyverse * 2.0.0 2023-02-22 [?] CRAN (R 4.3.0)\n#> P timechange 0.2.0 2023-01-11 [?] CRAN (R 4.3.0)\n#> P timeDate 4022.108 2023-01-07 [?] CRAN (R 4.3.0)\n#> P tsibble 1.1.3 2022-10-09 [?] CRAN (R 4.3.0)\n#> P tzdb 0.4.0 2023-05-12 [?] CRAN (R 4.3.0)\n#> P usethis 2.2.0 2023-06-06 [?] CRAN (R 4.3.0)\n#> P utf8 1.2.3 2023-01-31 [?] CRAN (R 4.3.0)\n#> P vctrs 0.6.2 2023-04-19 [?] CRAN (R 4.3.0)\n#> P viridisLite 0.4.2 2023-05-02 [?] CRAN (R 4.3.0)\n#> P withr 2.5.0 2022-03-03 [?] CRAN (R 4.3.0)\n#> P workflows 1.1.3 2023-02-22 [?] CRAN (R 4.3.0)\n#> P xfun 0.39 2023-04-20 [?] CRAN (R 4.3.0)\n#> P xml2 1.3.4 2023-04-27 [?] CRAN (R 4.3.0)\n#> P yaml 2.3.7 2023-01-23 [?] 
CRAN (R 4.3.0)\n#> \n#> [1] /Users/dajmcdon/Library/Caches/org.R-project.R/R/renv/library/delphi-tooling-book-d37e2426/R-4.3/aarch64-apple-darwin20\n#> [2] /Users/dajmcdon/Library/Caches/org.R-project.R/R/renv/sandbox/R-4.3/aarch64-apple-darwin20/84ba8b13\n#> \n#> P ── Loaded and on-disk path mismatch.\n#> \n#> ───────────────────────────────────────────────────────────────────────────\n```\n:::\n:::\n\n\n
\n\n\n\n", + "markdown": "---\ntoc-depth: 2\nnocite: |\n @*\n---\n\n\n# Preface {.unnumbered}\n\n\n::: {.cell}\n\n:::\n\n\n::: {.callout-caution}\nThis book is still under construction and may not yet be fully self-contained or reproducible. But it hopefully will be!\n:::\n\nThis book describes some of the functionality of the\n`{epiprocess}` and `{epipredict}` R packages, with an eye toward creating various types of signal processing and forecast creation for epidemiological data. The goal is to be able to load, inspect, process, and forecast\n --- using simple baselines to more elaborate customizations. \n\n## Installation {#sec-installation}\n\n\n\nThe following commands install the latest versions of the packages we use in this book:\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-2_ef21555fe232decfa5c5e6ce81cfc532'}\n\n```{.r .cell-code}\n# install.packages(\"pak\")\n\n# Install our packages from GitHub:\npak::pkg_install(\"cmu-delphi/epidatr\")\npak::pkg_install(\"cmu-delphi/epiprocess\")\npak::pkg_install(\"cmu-delphi/epipredict\")\npak::pkg_install(\"cmu-delphi/epidatasets\")\n# Other model-fitting packages we use in this book (via epipredict):\npak::pkg_install(\"poissonreg\")\npak::pkg_install(\"ranger\")\npak::pkg_install(\"xgboost\")\n# Other data processing, model evaluation, example data, and other packages we\n# use in this book:\npak::pkg_install(\"RcppRoll\")\npak::pkg_install(\"tidyverse\")\npak::pkg_install(\"tidymodels\")\npak::pkg_install(\"broom\")\npak::pkg_install(\"performance\")\npak::pkg_install(\"modeldata\")\npak::pkg_install(\"see\")\npak::pkg_install(\"sessioninfo\")\n```\n:::\n\n\nMuch of the data used for illustration can be loaded directly from [Delphi's Epidata API](https://cmu-delphi.github.io/delphi-epidata/) which is built and maintained by the Carnegie Mellon University [Delphi research group](https://delphi.cmu.edu/). 
We have tried to provide most of the data used in these examples in a separate package, `{epidatasets}`, but it can also be accessed using `{epidatr}`, an R interface to the API and the successor to [`{covidcast}`](https://cmu-delphi.github.io/covidcast/covidcastR/). These are also available from GitHub:\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-3_6a7154f9225b9bd3d6ffa321b2da25af'}\n\n```{.r .cell-code}\npak::pkg_install(\"cmu-delphi/epidatasets\")\npak::pkg_install(\"cmu-delphi/epidatr\")\n```\n:::\n\n\n\n
Encountering installation issues? Click here to show some potential solutions. \n\n### Linux installation issues: compilation errors or slowness\n\nIf you are using Linux and encounter any compilation errors above, or if\ncompilation is taking very long, you might try using the RStudio (now called\nPosit) Package Manager to install binaries. You can try running this command\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-4_1c0021dec1f71a9fdbdd1a577b67f72a'}\n\n```{.r .cell-code}\noptions(\n repos = c(\n # contains binaries for Linux:\n RSPM = \"https://packagemanager.rstudio.com/all/latest\",\n # backup CRAN mirror of your choice:\n CRAN = \"https://cran.rstudio.com/\"\n )\n)\n```\n:::\n\n\n### Reproducibility\n\nThe above commands will give you the current versions of the packages used in\nthis book. If you're having trouble reproducing some of the results, it may be\ndue to package updates that took place after the book was last updated. To match\nthe versions we used to generate this book, you can use the steps below.\n\n#### First: set up and store a GitHub PAT\n\nIf you don't already have a GitHub PAT, you can use the following helper functions to create one:\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-5_8e6231bc239928b163de90b8ac90ad95'}\n\n```{.r .cell-code}\n# Run this once:\ninstall.packages(\"usethis\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Installing usethis [2.2.2] ...\n#> \tOK [linked cache in 0.21 milliseconds]\n#> * Installed 1 package in 1.2 seconds.\n```\n:::\n\n```{.r .cell-code}\nusethis::create_github_token(\n scopes = \"public_repo\",\n description = \"For public repo access\"\n)\n```\n:::\n\nThis will open a web browser window allowing you to describe and customize\nsettings of the PAT. Scroll to the bottom and click \"Generate\ntoken\". 
You'll see a screen that has `ghp_` with a green background; you can click the two-squares (\"copy\") icon to copy this `ghp_......` string to the clipboard.\n\n#### Either A: Download and use the `renv.lock`\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-6_298fc2b073cbedc2c6a201948f33aed0'}\n\n```{.r .cell-code}\n# Run this once:\ninstall.packages(c(\"renv\", \"gitcreds\"))\ndownload.file(\"https://raw.githubusercontent.com/cmu-delphi/delphi-tooling-book/main/renv.lock\", \"delphi-tooling-book.renv.lock\")\n\n# Run this in a fresh session each time you'd like to use this set of versions.\n# Warning: don't save your GitHub PAT in a file you might share with others;\n# look into `gitcreds::gitcreds_set()` or `usethis::edit_r_environ()` instead.\nSys.setenv(\"GITHUB_PAT\" = \"ghp_............\")\nrenv::use(lockfile = \"delphi-tooling-book.renv.lock\")\n# If you get 401 errors, you may need to regenerate your GitHub PAT or check if\n# `gitcreds::gitcreds_get()` is detecting an old PAT you have saved somewhere.\n```\n:::\n\n\n#### Or B: Download the book and use its `.Rprofile`\n\n1. Download the book [here](https://github.com/cmu-delphi/delphi-tooling-book/archive/refs/heads/main.zip) and unzip it.\n2. One-time setup: launch R inside the delphi-tooling-book directory (to use its\n `.Rprofile` file) and run\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-7_5f193d921c8c0c99eb8e67696fb70a8b'}\n\n```{.r .cell-code}\n# Warning: don't save your GitHub PAT in a file you might share with others;\n# look into `gitcreds::gitcreds_set()` or `usethis::edit_r_environ()` instead.\nSys.setenv(\"GITHUB_PAT\" = \"ghp_............\")\nrenv::restore() # downloads the appropriate package versions\n```\n:::\n\n\n3. To use this set of versions: launch R inside the delphi-tooling-book directory.\n\n### Other issues\n\nPlease let us know! 
You can file an issue with the book [here](https://github.com/cmu-delphi/delphi-tooling-book/issues), or with one of the individual packages at their own issue pages: [epidatr](https://github.com/cmu-delphi/epidatr/issues), [epiprocess](https://github.com/cmu-delphi/epiprocess/issues), [epipredict](https://github.com/cmu-delphi/epipredict/issues).\n\n
\n\n\n## Documentation\n\nYou can view the complete documentation for these packages at \n\n* ,\n* ,\n* ,\n* .\n\n## Attribution\n\nThis document contains a number of datasets that are a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). These data are licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.\n\n[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): \n These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. 
\n\n\n\n## Quick-start example\n\nThese packages come with some built-in historical data for illustration, but\nup-to-date versions could be downloaded with the\n[`{epidatr}`](https://cmu-delphi.github.io/epidatr) or \n[`{covidcast}`](https://cmu-delphi.github.io/covidcast/covidcastR/index.html) \npackages and processed using\n[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/).[^index1]\n\n[^index1]: COVIDcast data and other epidemiological signals for non-Covid related illnesses are available with [`{epidatr}`](https://cmu-delphi.github.io/epidatr), which interfaces directly to Delphi's [Epidata API](https://cmu-delphi.github.io/delphi-epidata/).\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/epidf_48a293db285163cbde5b55b5d6115276'}\n\n```{.r .cell-code}\nlibrary(epipredict)\njhu <- case_death_rate_subset\njhu\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 20,496 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-31 12:08:25\n#> \n#> # A tibble: 20,496 × 4\n#> geo_value time_value case_rate death_rate\n#> * \n#> 1 ak 2020-12-31 35.9 0.158\n#> 2 al 2020-12-31 65.1 0.438\n#> 3 ar 2020-12-31 66.0 1.27 \n#> 4 as 2020-12-31 0 0 \n#> 5 az 2020-12-31 76.8 1.10 \n#> 6 ca 2020-12-31 96.0 0.751\n#> # ℹ 20,490 more rows\n```\n:::\n:::\n\n\nTo create and train a simple auto-regressive forecaster to predict the death rate two weeks into the future using past (lagged) deaths and cases, we could use the following function.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/make-forecasts_1d4f30fdea1cd893cb39553fa0f1d21c'}\n\n```{.r .cell-code}\ntwo_week_ahead <- arx_forecaster(\n jhu,\n outcome = \"death_rate\",\n predictors = c(\"case_rate\", \"death_rate\"),\n args_list = arx_args_list(\n lags = list(case_rate = c(0, 1, 2, 3, 7, 14), death_rate = c(0, 7, 14)),\n ahead = 14\n )\n)\n```\n:::\n\n\nIn this case, we have used a number of different lags for the case rate, while only 
using 3 weekly lags for the death rate (as predictors). The result is both a fitted model object which could be used any time in the future to create different forecasts, as well as a set of predicted values (and prediction intervals) for each location 14 days after the last available time value in the data.\n\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/print-model_f5f6b4212b46903845381e0a40889efc'}\n\n```{.r .cell-code}\ntwo_week_ahead$epi_workflow\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Epi Workflow [trained] ═══════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> Postprocessor: Frosting\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 6 Recipe Steps\n#> \n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> \n#> Call:\n#> stats::lm(formula = ..y ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) lag_0_case_rate lag_1_case_rate lag_2_case_rate \n#> -0.0073358 0.0030365 0.0012467 0.0009536 \n#> lag_3_case_rate lag_7_case_rate lag_14_case_rate lag_0_death_rate \n#> 0.0011425 0.0012481 0.0003041 0.1351769 \n#> lag_7_death_rate lag_14_death_rate \n#> 0.1471127 0.1062473 \n#> \n#> ── Postprocessor ────────────────────────────────────────────────────────────\n#> 5 Frosting Layers\n```\n:::\n:::\n\n\nThe fitted model here involved preprocessing the data to appropriately generate lagged predictors, estimating a linear model with `stats::lm()` and then postprocessing the results to be meaningful for epidemiological tasks. 
We can also examine the predictions.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/show-preds_4bf0ca6ef427c01aa0a0686ab430f93d'}\n\n```{.r .cell-code}\ntwo_week_ahead$predictions\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 56 × 5\n#> geo_value .pred .pred_distn forecast_date target_date\n#> \n#> 1 ak 0.449 quantiles(0.45)[2] 2021-12-31 2022-01-14 \n#> 2 al 0.574 quantiles(0.57)[2] 2021-12-31 2022-01-14 \n#> 3 ar 0.673 quantiles(0.67)[2] 2021-12-31 2022-01-14 \n#> 4 as 0 quantiles(0.12)[2] 2021-12-31 2022-01-14 \n#> 5 az 0.679 quantiles(0.68)[2] 2021-12-31 2022-01-14 \n#> 6 ca 0.575 quantiles(0.57)[2] 2021-12-31 2022-01-14 \n#> # ℹ 50 more rows\n```\n:::\n:::\n\n\nThe results above show a distributional forecast produced using data through the end of 2021 for the 14th of January 2022. A prediction for the death rate per 100K inhabitants is available for every state (`geo_value`) along with a 90% predictive interval. The figure below\ndisplays the forecast for a small handful of states. The vertical black line is the forecast date. 
The forecast doesn't appear to be particularly good, but our choices above were intended to be illustrative of the functionality rather than optimized for accuracy.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-8_1204ca19d449319e90b2fd3763e63dfd'}\n\n```{.r .cell-code code-fold=\"true\"}\nsamp_geos <- c(\"ca\", \"co\", \"ny\", \"pa\")\n\nhist <- jhu %>%\n filter(\n geo_value %in% samp_geos,\n time_value >= max(time_value) - 90L\n )\n\npreds <- two_week_ahead$predictions %>%\n filter(geo_value %in% samp_geos) %>%\n pivot_quantiles_wider(.pred_distn)\n\nggplot(hist, aes(color = geo_value)) +\n geom_line(aes(time_value, death_rate)) +\n theme_bw() +\n geom_errorbar(data = preds, aes(x = target_date, ymin = `0.05`, ymax = `0.95`)) +\n geom_point(data = preds, aes(target_date, .pred)) +\n geom_vline(data = preds, aes(xintercept = forecast_date)) +\n scale_colour_viridis_d(name = \"\") +\n scale_x_date(date_labels = \"%b %Y\") +\n theme(legend.position = \"bottom\") +\n labs(x = \"\", y = \"Incident deaths per 100K\\n inhabitants\")\n```\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-8-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\n## Contents\n\nThe remainder of this book examines this software in more detail, illustrating some of the flexibility that is available.\n\n---\n\n
Session Information. \n\nSee also @sec-installation.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-9_c866f0d1d0a1809a33be44cd8b8eec3f'}\n\n```{.r .cell-code}\nsessioninfo::session_info()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ─ Session info ────────────────────────────────────────────────────────────\n#> setting value\n#> version R version 4.1.3 (2022-03-10)\n#> os Fedora Linux 36 (Workstation Edition)\n#> system x86_64, linux-gnu\n#> ui X11\n#> language (EN)\n#> collate en_US.UTF-8\n#> ctype en_US.UTF-8\n#> tz America/Los_Angeles\n#> date 2023-12-15\n#> pandoc 2.14.0.3 @ /usr/bin/ (via rmarkdown)\n#> \n#> ─ Packages ────────────────────────────────────────────────────────────────\n#> ! package * version date (UTC) lib source\n#> P anytime 0.3.9 2020-08-27 [?] RSPM (R 4.1.3)\n#> P askpass 1.1 2019-01-13 [?] CRAN (R 4.0.5)\n#> P backports 1.4.1 2021-12-13 [?] CRAN (R 4.0.5)\n#> P cachem 1.0.8 2023-05-01 [?] RSPM (R 4.1.3)\n#> P checkmate 2.2.0 2023-04-27 [?] RSPM (R 4.1.3)\n#> P class 7.3-22 2023-05-03 [?] CRAN (R 4.1.3)\n#> P cli 3.6.1 2023-03-23 [?] RSPM (R 4.1.3)\n#> P codetools 0.2-19 2023-02-01 [?] RSPM (R 4.1.3)\n#> P colorspace 2.1-0 2023-01-23 [?] RSPM (R 4.1.3)\n#> P crayon 1.5.2 2022-09-29 [?] RSPM\n#> P data.table 1.14.8 2023-02-17 [?] RSPM (R 4.1.3)\n#> P digest 0.6.31 2022-12-11 [?] RSPM (R 4.1.3)\n#> P distributional 0.3.2 2023-03-22 [?] RSPM (R 4.1.3)\n#> P dplyr * 1.1.2 2023-04-20 [?] RSPM (R 4.1.3)\n#> P ellipsis 0.3.2 2021-04-29 [?] CRAN (R 4.0.5)\n#> P epidatasets * 0.0.1 2023-06-20 [?] Github (cmu-delphi/epidatasets@cc8f2a0)\n#> P epidatr * 1.0.0.9000 2023-12-15 [?] Github (cmu-delphi/epidatr@6e9f899)\n#> P epipredict * 0.0.6 2023-11-08 [?] Github (cmu-delphi/epipredict@378577a)\n#> P epiprocess * 0.7.0.9999 2023-12-15 [?] Github (cmu-delphi/epiprocess@b444a3c)\n#> P evaluate 0.21 2023-05-05 [?] RSPM (R 4.1.3)\n#> P fansi 1.0.4 2023-01-22 [?] RSPM (R 4.1.3)\n#> P farver 2.1.1 2022-07-06 [?] 
RSPM (R 4.1.3)\n#> P fastmap 1.1.1 2023-02-24 [?] RSPM (R 4.1.3)\n#> P forcats * 1.0.0 2023-01-29 [?] RSPM\n#> P fs 1.6.2 2023-04-25 [?] RSPM (R 4.1.3)\n#> P future 1.32.0 2023-03-07 [?] RSPM\n#> P future.apply 1.11.0 2023-05-21 [?] RSPM\n#> P generics 0.1.3 2022-07-05 [?] RSPM (R 4.1.3)\n#> P ggplot2 * 3.4.2 2023-04-03 [?] RSPM (R 4.1.3)\n#> P globals 0.16.2 2022-11-21 [?] RSPM (R 4.1.3)\n#> P glue 1.6.2 2022-02-24 [?] CRAN (R 4.0.5)\n#> P gower 1.0.1 2022-12-22 [?] RSPM\n#> P gtable 0.3.3 2023-03-21 [?] RSPM (R 4.1.3)\n#> P hardhat 1.3.0 2023-03-30 [?] RSPM (R 4.1.3)\n#> P hms 1.1.3 2023-03-21 [?] RSPM\n#> P htmltools 0.5.5 2023-03-23 [?] RSPM (R 4.1.3)\n#> P httr 1.4.6 2023-05-08 [?] CRAN (R 4.1.3)\n#> P ipred 0.9-14 2023-03-09 [?] RSPM\n#> P jsonlite 1.8.5 2023-06-05 [?] RSPM (R 4.1.3)\n#> P knitr 1.43 2023-05-25 [?] RSPM (R 4.1.3)\n#> P labeling 0.4.2 2020-10-20 [?] CRAN (R 4.0.5)\n#> P lattice 0.21-8 2023-04-05 [?] RSPM (R 4.1.3)\n#> P lava 1.7.2.1 2023-02-27 [?] RSPM\n#> P lifecycle 1.0.3 2022-10-07 [?] RSPM (R 4.1.3)\n#> P listenv 0.9.0 2022-12-16 [?] RSPM\n#> P lubridate * 1.9.2 2023-02-10 [?] CRAN (R 4.1.3)\n#> P magrittr 2.0.3 2022-03-30 [?] CRAN (R 4.0.5)\n#> P MASS 7.3-60 2023-05-04 [?] RSPM (R 4.1.3)\n#> P Matrix 1.5-4 2023-04-04 [?] CRAN (R 4.1.3)\n#> P MatrixModels 0.5-1 2022-09-11 [?] RSPM (R 4.1.3)\n#> P MMWRweek 0.1.3 2020-04-22 [?] RSPM (R 4.1.3)\n#> P munsell 0.5.0 2018-06-12 [?] CRAN (R 4.0.5)\n#> P nnet 7.3-19 2023-05-03 [?] RSPM (R 4.1.3)\n#> P openssl 2.0.6 2023-03-09 [?] RSPM (R 4.1.3)\n#> P parallelly 1.36.0 2023-05-26 [?] RSPM\n#> P parsnip * 1.1.0 2023-04-12 [?] RSPM (R 4.1.3)\n#> P pillar 1.9.0 2023-03-22 [?] RSPM (R 4.1.3)\n#> P pkgconfig 2.0.3 2019-09-22 [?] CRAN (R 4.1.3)\n#> P prodlim 2023.03.31 2023-04-02 [?] RSPM\n#> P purrr * 1.0.1 2023-01-10 [?] RSPM (R 4.1.3)\n#> P quantreg 5.95 2023-04-08 [?] RSPM (R 4.1.3)\n#> P R.cache 0.16.0 2022-07-21 [?] RSPM (R 4.1.3)\n#> P R.methodsS3 1.8.2 2022-06-13 [?] 
RSPM (R 4.1.3)\n#> P R.oo 1.25.0 2022-06-12 [?] RSPM (R 4.1.3)\n#> P R.utils 2.12.2 2022-11-11 [?] RSPM (R 4.1.3)\n#> P R6 2.5.1 2021-08-19 [?] CRAN (R 4.0.5)\n#> P Rcpp 1.0.10 2023-01-22 [?] RSPM (R 4.1.3)\n#> P readr * 2.1.4 2023-02-10 [?] RSPM\n#> P recipes 1.0.6 2023-04-25 [?] RSPM (R 4.1.3)\n#> P renv 0.17.3 2023-04-06 [?] RSPM (R 4.1.3)\n#> P rlang 1.1.1 2023-04-28 [?] RSPM (R 4.1.3)\n#> P rmarkdown 2.22 2023-06-01 [?] RSPM (R 4.1.3)\n#> P rpart 4.1.19 2022-10-21 [?] RSPM (R 4.1.3)\n#> P rstudioapi 0.14 2022-08-22 [?] RSPM (R 4.1.3)\n#> P scales 1.2.1 2022-08-20 [?] RSPM (R 4.1.3)\n#> P sessioninfo 1.2.2 2021-12-06 [?] CRAN (R 4.1.3)\n#> P smoothqr 0.1.1 2023-06-20 [?] Github (dajmcdon/smoothqr@3def5f0)\n#> P SparseM 1.81 2021-02-18 [?] RSPM (R 4.1.3)\n#> P stringi 1.7.12 2023-01-11 [?] RSPM (R 4.1.3)\n#> P stringr * 1.5.0 2022-12-02 [?] RSPM (R 4.1.3)\n#> P styler 1.10.1 2023-06-05 [?] RSPM (R 4.1.3)\n#> P survival 3.5-5 2023-03-12 [?] RSPM (R 4.1.3)\n#> P tibble * 3.2.1 2023-03-20 [?] RSPM (R 4.1.3)\n#> P tidyr * 1.3.0 2023-01-24 [?] RSPM (R 4.1.3)\n#> P tidyselect 1.2.0 2022-10-10 [?] RSPM (R 4.1.3)\n#> P tidyverse * 2.0.0 2023-02-22 [?] RSPM\n#> P timechange 0.2.0 2023-01-11 [?] CRAN (R 4.1.3)\n#> P timeDate 4022.108 2023-01-07 [?] RSPM\n#> P tsibble 1.1.3 2022-10-09 [?] RSPM (R 4.1.3)\n#> P tzdb 0.4.0 2023-05-12 [?] RSPM (R 4.1.3)\n#> P usethis 2.2.2 2023-07-06 [?] RSPM (R 4.1.3)\n#> P utf8 1.2.3 2023-01-31 [?] RSPM (R 4.1.3)\n#> P vctrs 0.6.2 2023-04-19 [?] CRAN (R 4.1.3)\n#> P viridisLite 0.4.2 2023-05-02 [?] RSPM (R 4.1.3)\n#> P withr 2.5.0 2022-03-03 [?] CRAN (R 4.0.5)\n#> P workflows 1.1.3 2023-02-22 [?] RSPM (R 4.1.3)\n#> P xfun 0.39 2023-04-20 [?] RSPM (R 4.1.3)\n#> P xml2 1.3.4 2023-04-27 [?] CRAN (R 4.1.3)\n#> P yaml 2.3.7 2023-01-23 [?] 
RSPM (R 4.1.3)\n#> \n#> [1] /home/fullname/.cache/R/renv/library/delphi-tooling-book-1266ecb6/R-4.1/x86_64-redhat-linux-gnu\n#> [2] /home/fullname/.cache/R/renv/sandbox/R-4.1/x86_64-redhat-linux-gnu/60c4e220\n#> \n#> P ── Loaded and on-disk path mismatch.\n#> \n#> ───────────────────────────────────────────────────────────────────────────\n```\n:::\n:::\n\n\n
\n\n\n\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/outliers/execute-results/html.json b/_freeze/outliers/execute-results/html.json index b79f742..bcf951b 100644 --- a/_freeze/outliers/execute-results/html.json +++ b/_freeze/outliers/execute-results/html.json @@ -1,7 +1,7 @@ { "hash": "2c87fd6b2160a0e6e82f132ff958832d", "result": { - "markdown": "# Detect and correct outliers in signals\n\nThis chapter describes functionality for detecting and correcting outliers in\nsignals in the `detect_outlr()` and `correct_outlr()` functions provided in the\n`epiprocess` package. These functions is designed to be modular and extendable,\nso that you can define your own outlier detection and correction routines and\napply them to `epi_df` objects. We'll demonstrate this using state-level daily\nreported COVID-19 case counts from FL and NJ.\n\n\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-2_a04e38e37e2a0cee4145786b428621e0'}\n\n```{.r .cell-code}\nx <- incidence_num_outlier_example\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-3_eeb1c583efb1d858ceb57a9c288edf2e'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(x, aes(x = time_value, y = cases, color = geo_value)) +\n geom_line() +\n scale_color_manual(values = c(3, 6)) +\n geom_hline(yintercept = 0, linetype = 3) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Reported COVID-19 counts\")\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-3-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nThere are multiple outliers in these data that a modeler may want to detect and\ncorrect. 
We'll discuss those two tasks in turn.\n\n## Outlier detection\n\nThe `detect_outlr()` function allows us to run multiple outlier detection\nmethods on a given signal, and then (optionally) combine the results from those\nmethods. Here, we'll investigate outlier detection results from the following\nmethods.\n\n1. Detection based on a rolling median, using `detect_outlr_rm()`, which \n computes a rolling median on with a default window size of `n` time points \n centered at the time point under consideration, and then computes thresholds \n based on a multiplier times a rolling IQR computed on the residuals. \n2. Detection based on a seasonal-trend decomposition using LOESS (STL), using\n `detect_outlr_stl()`, which is similar to the rolling median method but \n replaces the rolling median with fitted values from STL. \n3. Detection based on an STL decomposition, but without seasonality term, which\n amounts to smoothing using LOESS.\n\nThe outlier detection methods are specified using a `tibble` that is passed to\n`detect_outlr()`, with one row per method, and whose columms specify the\noutlier detection function, any input arguments (only nondefault values need to\nbe supplied), and an abbreviated name for the method used in tracking results.\nAbbreviations \"rm\" and \"stl\" can be used for the built-in detection functions \n`detect_outlr_rm()` and `detect_outlr_stl()`, respectively.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-4_d718f9b3ce1f2b62cbbefda0c73956dc'}\n\n```{.r .cell-code}\ndetection_methods <- bind_rows(\n tibble(\n method = \"rm\",\n args = list(list(\n detect_negatives = TRUE,\n detection_multiplier = 2.5\n )),\n abbr = \"rm\"\n ),\n tibble(\n method = \"stl\",\n args = list(list(\n detect_negatives = TRUE,\n detection_multiplier = 2.5,\n seasonal_period = 7\n )),\n abbr = \"stl_seasonal\"\n ),\n tibble(\n method = \"stl\",\n args = list(list(\n detect_negatives = TRUE,\n detection_multiplier = 2.5,\n 
seasonal_period = NULL\n )),\n abbr = \"stl_nonseasonal\"\n )\n)\n\ndetection_methods\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 3 × 3\n#> method args abbr \n#> \n#> 1 rm rm \n#> 2 stl stl_seasonal \n#> 3 stl stl_nonseasonal\n```\n:::\n:::\n\n\nAdditionally, we'll form combined lower and upper thresholds, calculated as the\nmedian of the lower and upper thresholds from the methods at each time point.\nNote that using this combined median threshold is equivalent to using a majority\nvote across the base methods to determine whether a value is an outlier.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-5_8b0c1909c0789a5ed4ad41dc03bdbcc0'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n mutate(\n outlier_info = detect_outlr(\n x = time_value, y = cases,\n methods = detection_methods,\n combiner = \"median\"\n )\n ) %>%\n ungroup() %>%\n unnest(outlier_info)\n\nx\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 730 x 15 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-21 15:17:14.962335\n#> \n#> # A tibble: 730 × 15\n#> geo_value time_value cases rm_lower rm_upper rm_replacement\n#> * \n#> 1 fl 2020-06-01 667 345 2195 667\n#> 2 nj 2020-06-01 486 64.4 926. 486\n#> 3 fl 2020-06-02 617 406. 2169. 617\n#> 4 nj 2020-06-02 658 140. 841. 658\n#> 5 fl 2020-06-03 1317 468. 2142. 
1317\n#> 6 nj 2020-06-03 541 216 756 541\n#> # ℹ 724 more rows\n#> # ℹ 9 more variables: stl_seasonal_lower , stl_seasonal_upper , …\n```\n:::\n:::\n\n\nTo visualize the results, we define a convenience function for and call it on \neach state separately (hidden below the fold).\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-6_b18b51621bf6de1276da27cedba0106c'}\n\n```{.r .cell-code code-fold=\"true\"}\n# Plot outlier detection bands and/or points identified as outliers\nplot_outlr <- function(\n x, signal, method_abbr, bands = TRUE, points = TRUE,\n facet_vars = vars(geo_value), nrow = NULL, ncol = NULL,\n scales = \"fixed\") {\n # Convert outlier detection results to long format\n signal <- rlang::enquo(signal)\n x_long <- x %>%\n pivot_longer(\n cols = starts_with(method_abbr),\n names_to = c(\"method\", \".value\"),\n names_pattern = \"(.+)_(.+)\"\n )\n\n # Start of plot with observed data\n p <- ggplot() +\n geom_line(data = x, mapping = aes(x = time_value, y = !!signal))\n\n # If requested, add bands\n if (bands) {\n p <- p + geom_ribbon(\n data = x_long,\n aes(\n x = time_value, ymin = lower, ymax = upper,\n color = method\n ), fill = NA\n )\n }\n\n # If requested, add points\n if (points) {\n x_detected <- x_long %>% filter((!!signal < lower) | (!!signal > upper))\n p <- p + geom_point(\n data = x_detected,\n aes(\n x = time_value, y = !!signal, color = method,\n shape = method\n )\n )\n }\n\n # If requested, add faceting\n if (!is.null(facet_vars)) {\n p <- p + facet_wrap(facet_vars, nrow = nrow, ncol = ncol, scales = scales)\n }\n\n return(p)\n}\n```\n:::\n\n\nNow we produce plots for each state at a time, faceting by the detection method.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-7_5405f07bf3cd51cfe2ca67bccef65fe0'}\n\n```{.r .cell-code code-fold=\"true\"}\nmethod_abbr <- c(detection_methods$abbr, \"combined\")\n\nplot_outlr(x %>% filter(geo_value == \"fl\"), cases, method_abbr,\n 
facet_vars = vars(method), scales = \"free_y\", ncol = 2\n) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(\n x = \"Date\", y = \"Reported COVID-19 counts\", color = \"Method\",\n shape = \"Method\"\n ) +\n scale_color_brewer(palette = \"Set1\") +\n ggtitle(\"Florida\") +\n theme(legend.position = \"bottom\")\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-7-1.svg){fig-align='center' width=90%}\n:::\n\n```{.r .cell-code code-fold=\"true\"}\nplot_outlr(x %>% filter(geo_value == \"nj\"), cases, method_abbr,\n facet_vars = vars(method), scales = \"free_y\", ncol = 2\n) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(\n x = \"Date\", y = \"Reported COVID-19 counts\", color = \"Method\",\n shape = \"Method\"\n ) +\n scale_color_brewer(palette = \"Set1\") +\n ggtitle(\"New Jersey\") +\n theme(legend.position = \"bottom\")\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-7-2.svg){fig-align='center' width=90%}\n:::\n:::\n\n\n## Outlier correction\n\nFinally, in order to correct outliers, we can use the posited replacement values\nreturned by each outlier detection method. 
Below we use the replacement value\nfrom the combined method, which is defined by the median of replacement values \nfrom the base methods at each time point.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-8_747bff7bae49f5f0304632fd3b1558a9'}\n\n```{.r .cell-code}\ny <- x %>%\n mutate(cases_corrected = combined_replacement) %>%\n select(geo_value, time_value, cases, cases_corrected)\n\ny %>% filter(cases != cases_corrected)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 22 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-21 15:17:14.962335\n#> \n#> # A tibble: 22 × 4\n#> geo_value time_value cases cases_corrected\n#> * \n#> 1 fl 2020-07-12 15300 10181 \n#> 2 nj 2020-07-19 -8 320.\n#> 3 nj 2020-08-13 694 404.\n#> 4 nj 2020-08-14 619 397.\n#> 5 nj 2020-08-16 40 366 \n#> 6 nj 2020-08-22 555 360 \n#> # ℹ 16 more rows\n```\n:::\n:::\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-9_d88f9e8692dcd1d4f7b70de883a83a80'}\n\n```{.r .cell-code code-fold=\"true\"}\ny %>%\n pivot_longer(starts_with(\"cases\")) %>%\n ggplot(aes(x = time_value)) +\n geom_line(aes(y = value, color = name, linetype = name)) +\n scale_color_brewer(palette = \"Set1\") +\n scale_linetype_manual(values = c(2, 1)) +\n geom_hline(yintercept = 0) +\n facet_wrap(vars(geo_value), scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Reported COVID-19 counts\") +\n theme(legend.position = \"bottom\", legend.title = element_blank())\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-9-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nMore advanced correction functionality will be coming at some point in the \nfuture. 
\n\n", + "markdown": "# Detect and correct outliers in signals\n\nThis chapter describes functionality for detecting and correcting outliers in\nsignals in the `detect_outlr()` and `correct_outlr()` functions provided in the\n`epiprocess` package. These functions is designed to be modular and extendable,\nso that you can define your own outlier detection and correction routines and\napply them to `epi_df` objects. We'll demonstrate this using state-level daily\nreported COVID-19 case counts from FL and NJ.\n\n\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-2_a04e38e37e2a0cee4145786b428621e0'}\n\n```{.r .cell-code}\nx <- incidence_num_outlier_example\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-3_eeb1c583efb1d858ceb57a9c288edf2e'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(x, aes(x = time_value, y = cases, color = geo_value)) +\n geom_line() +\n scale_color_manual(values = c(3, 6)) +\n geom_hline(yintercept = 0, linetype = 3) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Reported COVID-19 counts\")\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-3-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nThere are multiple outliers in these data that a modeler may want to detect and\ncorrect. We'll discuss those two tasks in turn.\n\n## Outlier detection\n\nThe `detect_outlr()` function allows us to run multiple outlier detection\nmethods on a given signal, and then (optionally) combine the results from those\nmethods. Here, we'll investigate outlier detection results from the following\nmethods.\n\n1. 
Detection based on a rolling median, using `detect_outlr_rm()`, which \n computes a rolling median on with a default window size of `n` time points \n centered at the time point under consideration, and then computes thresholds \n based on a multiplier times a rolling IQR computed on the residuals. \n2. Detection based on a seasonal-trend decomposition using LOESS (STL), using\n `detect_outlr_stl()`, which is similar to the rolling median method but \n replaces the rolling median with fitted values from STL. \n3. Detection based on an STL decomposition, but without seasonality term, which\n amounts to smoothing using LOESS.\n\nThe outlier detection methods are specified using a `tibble` that is passed to\n`detect_outlr()`, with one row per method, and whose columms specify the\noutlier detection function, any input arguments (only nondefault values need to\nbe supplied), and an abbreviated name for the method used in tracking results.\nAbbreviations \"rm\" and \"stl\" can be used for the built-in detection functions \n`detect_outlr_rm()` and `detect_outlr_stl()`, respectively.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-4_d718f9b3ce1f2b62cbbefda0c73956dc'}\n\n```{.r .cell-code}\ndetection_methods <- bind_rows(\n tibble(\n method = \"rm\",\n args = list(list(\n detect_negatives = TRUE,\n detection_multiplier = 2.5\n )),\n abbr = \"rm\"\n ),\n tibble(\n method = \"stl\",\n args = list(list(\n detect_negatives = TRUE,\n detection_multiplier = 2.5,\n seasonal_period = 7\n )),\n abbr = \"stl_seasonal\"\n ),\n tibble(\n method = \"stl\",\n args = list(list(\n detect_negatives = TRUE,\n detection_multiplier = 2.5,\n seasonal_period = NULL\n )),\n abbr = \"stl_nonseasonal\"\n )\n)\n\ndetection_methods\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 3 × 3\n#> method args abbr \n#> \n#> 1 rm rm \n#> 2 stl stl_seasonal \n#> 3 stl stl_nonseasonal\n```\n:::\n:::\n\n\nAdditionally, we'll form combined lower and upper 
thresholds, calculated as the\nmedian of the lower and upper thresholds from the methods at each time point.\nNote that using this combined median threshold is equivalent to using a majority\nvote across the base methods to determine whether a value is an outlier.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-5_8b0c1909c0789a5ed4ad41dc03bdbcc0'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n mutate(\n outlier_info = detect_outlr(\n x = time_value, y = cases,\n methods = detection_methods,\n combiner = \"median\"\n )\n ) %>%\n ungroup() %>%\n unnest(outlier_info)\n\nx\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 730 x 15 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-21 15:17:14\n#> \n#> # A tibble: 730 × 15\n#> geo_value time_value cases rm_lower rm_upper rm_replacement\n#> * \n#> 1 fl 2020-06-01 667 345 2195 667\n#> 2 nj 2020-06-01 486 64.4 926. 486\n#> 3 fl 2020-06-02 617 406. 2169. 617\n#> 4 nj 2020-06-02 658 140. 841. 658\n#> 5 fl 2020-06-03 1317 468. 2142. 
1317\n#> 6 nj 2020-06-03 541 216 756 541\n#> # ℹ 724 more rows\n#> # ℹ 9 more variables: stl_seasonal_lower , stl_seasonal_upper , …\n```\n:::\n:::\n\n\nTo visualize the results, we define a convenience function for and call it on \neach state separately (hidden below the fold).\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-6_b18b51621bf6de1276da27cedba0106c'}\n\n```{.r .cell-code code-fold=\"true\"}\n# Plot outlier detection bands and/or points identified as outliers\nplot_outlr <- function(\n x, signal, method_abbr, bands = TRUE, points = TRUE,\n facet_vars = vars(geo_value), nrow = NULL, ncol = NULL,\n scales = \"fixed\") {\n # Convert outlier detection results to long format\n signal <- rlang::enquo(signal)\n x_long <- x %>%\n pivot_longer(\n cols = starts_with(method_abbr),\n names_to = c(\"method\", \".value\"),\n names_pattern = \"(.+)_(.+)\"\n )\n\n # Start of plot with observed data\n p <- ggplot() +\n geom_line(data = x, mapping = aes(x = time_value, y = !!signal))\n\n # If requested, add bands\n if (bands) {\n p <- p + geom_ribbon(\n data = x_long,\n aes(\n x = time_value, ymin = lower, ymax = upper,\n color = method\n ), fill = NA\n )\n }\n\n # If requested, add points\n if (points) {\n x_detected <- x_long %>% filter((!!signal < lower) | (!!signal > upper))\n p <- p + geom_point(\n data = x_detected,\n aes(\n x = time_value, y = !!signal, color = method,\n shape = method\n )\n )\n }\n\n # If requested, add faceting\n if (!is.null(facet_vars)) {\n p <- p + facet_wrap(facet_vars, nrow = nrow, ncol = ncol, scales = scales)\n }\n\n return(p)\n}\n```\n:::\n\n\nNow we produce plots for each state at a time, faceting by the detection method.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-7_5405f07bf3cd51cfe2ca67bccef65fe0'}\n\n```{.r .cell-code code-fold=\"true\"}\nmethod_abbr <- c(detection_methods$abbr, \"combined\")\n\nplot_outlr(x %>% filter(geo_value == \"fl\"), cases, method_abbr,\n 
facet_vars = vars(method), scales = \"free_y\", ncol = 2\n) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(\n x = \"Date\", y = \"Reported COVID-19 counts\", color = \"Method\",\n shape = \"Method\"\n ) +\n scale_color_brewer(palette = \"Set1\") +\n ggtitle(\"Florida\") +\n theme(legend.position = \"bottom\")\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-7-1.svg){fig-align='center' width=90%}\n:::\n\n```{.r .cell-code code-fold=\"true\"}\nplot_outlr(x %>% filter(geo_value == \"nj\"), cases, method_abbr,\n facet_vars = vars(method), scales = \"free_y\", ncol = 2\n) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(\n x = \"Date\", y = \"Reported COVID-19 counts\", color = \"Method\",\n shape = \"Method\"\n ) +\n scale_color_brewer(palette = \"Set1\") +\n ggtitle(\"New Jersey\") +\n theme(legend.position = \"bottom\")\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-7-2.svg){fig-align='center' width=90%}\n:::\n:::\n\n\n## Outlier correction\n\nFinally, in order to correct outliers, we can use the posited replacement values\nreturned by each outlier detection method. 
Below we use the replacement value\nfrom the combined method, which is defined by the median of replacement values \nfrom the base methods at each time point.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-8_747bff7bae49f5f0304632fd3b1558a9'}\n\n```{.r .cell-code}\ny <- x %>%\n mutate(cases_corrected = combined_replacement) %>%\n select(geo_value, time_value, cases, cases_corrected)\n\ny %>% filter(cases != cases_corrected)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 22 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-21 15:17:14\n#> \n#> # A tibble: 22 × 4\n#> geo_value time_value cases cases_corrected\n#> * \n#> 1 fl 2020-07-12 15300 10181 \n#> 2 nj 2020-07-19 -8 320.\n#> 3 nj 2020-08-13 694 404.\n#> 4 nj 2020-08-14 619 397.\n#> 5 nj 2020-08-16 40 366 \n#> 6 nj 2020-08-22 555 360 \n#> # ℹ 16 more rows\n```\n:::\n:::\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-9_d88f9e8692dcd1d4f7b70de883a83a80'}\n\n```{.r .cell-code code-fold=\"true\"}\ny %>%\n pivot_longer(starts_with(\"cases\")) %>%\n ggplot(aes(x = time_value)) +\n geom_line(aes(y = value, color = name, linetype = name)) +\n scale_color_brewer(palette = \"Set1\") +\n scale_linetype_manual(values = c(2, 1)) +\n geom_hline(yintercept = 0) +\n facet_wrap(vars(geo_value), scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Reported COVID-19 counts\") +\n theme(legend.position = \"bottom\", legend.title = element_blank())\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-9-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nMore advanced correction functionality will be coming at some point in the \nfuture. 
\n\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/preprocessing-and-models/execute-results/html.json b/_freeze/preprocessing-and-models/execute-results/html.json index 4c10b66..288e244 100644 --- a/_freeze/preprocessing-and-models/execute-results/html.json +++ b/_freeze/preprocessing-and-models/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "6adfd7f5b56b89ea64597f2ceaa58af6", + "hash": "ff6d7f3fc9125df42623428c443d0442", "result": { - "markdown": "# Examples of Preprocessing and Models\n\n\n::: {.cell hash='preprocessing-and-models_cache/html/unnamed-chunk-1_381118b4e9b3d94744097935259e52ef'}\n\n:::\n\n\n\n## Introduction \n\nThe `epipredict` package uses the `tidymodels` framework, namely \n[`{recipes}`](https://recipes.tidymodels.org/) for \n[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences \nof feature engineering and [`{parsnip}`](https://parsnip.tidymodels.org/) \nfor a unified interface to a range of models. \n\n`epipredict` has additional customized feature engineering and preprocessing \nsteps that specifically work with panel data in this context, for example,\n`step_epi_lag()`, `step_population_scaling()`, \n`step_epi_naomit()`. They can be used along with most\nsteps from the `{recipes}` package for more feature engineering. \n\nIn this vignette, we will illustrate some examples of how to use `epipredict`\nwith `recipes` and `parsnip` for different purposes of \nepidemiological forecasting.\nWe will focus on basic autoregressive models, in which COVID cases and \ndeaths in the near future are predicted using a linear combination of cases\nand deaths in the near past.\n\nThe remaining vignette will be split into three sections. In the first\nsection, we \nwill use a Poisson regression to predict death counts. In the second section,\nwe will use a linear regression to predict death rates. Last but not least, we\nwill create a classification model for hotspot predictions. 
\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-2_addbfa8195f76724bf5d26a47c5098c2'}\n\n```{.r .cell-code}\nlibrary(epidatr)\nlibrary(epipredict)\nlibrary(recipes)\nlibrary(workflows)\nlibrary(poissonreg)\n```\n:::\n\n\n## Poisson Regression \n\nDuring COVID-19, the U.S. Centers for Disease Control and Prevention (CDC) \ncollected models\nand forecasts to characterize the state of an outbreak and its course. They use\nit to inform public health decision makers on potential consequences of \ndeploying control measures.\n\nOne of the outcomes that the CDC forecasts is [death counts from COVID-19](https://www.cdc.gov/coronavirus/2019-ncov/science/forecasting/forecasting-us.html).\nAlthough there are many state-of-the-art models, we choose to use Poisson \nregression, the textbook example for modeling count data, as an illustration\nfor using the `epipredict` package with other existing `{tidymodels}` packages. \n\nThe (folded) code below gives the necessary commands to download this data\nfrom the Delphi Epidata API, but it is also built into the\n[`{epidatasets}`](https://cmu-delphi.github.io/epidatasets/reference/counts_subset.html)\npackage.\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/poisson-reg-data_569f553a0c03c619ba69298e3a4c936c'}\n\n```{.r .cell-code code-fold=\"true\"}\ngeos <- c(\"ca\", \"fl\", \"tx\", \"ny\", \"nj\")\nx <- covidcast(\n data_source = \"jhu-csse\",\n signals = \"confirmed_incidence_num\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20210604, 20211231),\n geo_values = geos\n) %>%\n fetch() %>%\n select(geo_value, time_value, cases = value)\n\ny <- covidcast(\n data_source = \"jhu-csse\",\n signals = \"deaths_incidence_num\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20210604, 20211231),\n geo_values = geos\n) %>%\n fetch() %>%\n select(geo_value, time_value, deaths = value)\n\ncounts_subset <- 
full_join(x, y, by = c(\"geo_value\", \"time_value\")) %>%\n as_epi_df()\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-3_6ecff77ccdcb29ad29987c08784305d2'}\n\n```{.r .cell-code}\ndata(counts_subset, package = \"epidatasets\")\n```\n:::\n\n\nThe `counts_subset` dataset\ncontains the number of confirmed cases and deaths from June 4, 2021 to \nDec 31, 2021 in some U.S. states. \n\nWe wish to predict the 7-day ahead death counts with lagged cases and deaths.\nFurthermore, we will let each state be a dummy variable. Using differential \nintercept coefficients, we can allow for an intercept shift between states.\n\nOne possible model takes the form\n\\begin{aligned}\n\\log\\left( \\mu_{t+7} \\right) &{}= \\beta_0 + \\delta_1 s_{\\text{state}_1} +\n\\delta_2 s_{\\text{state}_2} + \\cdots + \\nonumber \\\\ &\\quad\\beta_1 \\text{deaths}_{t} + \n\\beta_2 \\text{deaths}_{t-7} + \\beta_3 \\text{cases}_{t} + \n\\beta_4 \\text{cases}_{t-7},\n\\end{aligned}\nwhere $\\mu_{t+7} = \\mathbb{E}(\\text{deaths}_{t+7})$, and $\\text{deaths}_{t+7}$\nis assumed to follow a Poisson distribution with mean $\\mu_{t+7}$;\n$s_{\\text{state}}$ are dummy variables for each state and take values of either\n0 or 1.\n\nPreprocessing steps will be performed to prepare the\ndata for model fitting. But before diving into them, it will be helpful to understand what `roles` are in the `recipes` framework. \n\n---\n\n#### Aside on `recipes` {.unnumbered}\n\n`recipes` can assign one or more roles to each column in the data. The roles \nare not restricted to a predefined set; they can be anything. \nFor most conventional situations, they are typically “predictor” and/or \n\"outcome\". Additional roles enable targeted `step_*()` operations on specific \nvariables or groups of variables.\n\nIn our case, the role `predictor` is given to explanatory variables on the\nright-hand side of the model (in the equation above). 
\nThe role `outcome` is the response variable \nthat we wish to predict. `geo_value` and `time_value` are predefined roles \nthat are unique to the `epipredict` package. Since we work with `epi_df` \nobjects, all datasets should have `geo_value` and `time_value` passed through\nautomatically with these two roles assigned to the appropriate columns in the data.\n \nThe `recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) \nin bulk. There are a few handy functions that can be used together to help us \nmanipulate variable roles easily. \n\n> `update_role()` alters an existing role in the recipe or assigns an initial role \n> to variables that do not yet have a declared role.\n> \n> `add_role()` adds an additional role to variables that already have a role in \n> the recipe, without overwriting old roles.\n> \n> `remove_role()` eliminates a single existing role in the recipe.\n\n#### End aside {.unnumbered}\n\n---\n\nNotice in the following preprocessing steps, we used `add_role()` on \n`geo_value_factor` since, currently, the default role for it is `raw`, but\nwe would like to reuse this variable as a `predictor`.\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-4_90b277143c4c937a1680363162df6b8b'}\n\n```{.r .cell-code}\ncounts_subset <- counts_subset %>%\n mutate(geo_value_factor = as.factor(geo_value)) %>%\n as_epi_df()\n\nepi_recipe(counts_subset)\n\nr <- epi_recipe(counts_subset) %>%\n add_role(geo_value_factor, new_role = \"predictor\") %>%\n step_dummy(geo_value_factor) %>%\n ## Occasionally, data reporting errors / corrections result in negative\n ## cases / deaths\n step_mutate(cases = pmax(cases, 0), deaths = pmax(deaths, 0)) %>%\n step_epi_lag(cases, deaths, lag = c(0, 7)) %>%\n step_epi_ahead(deaths, ahead = 7, role = \"outcome\") %>%\n step_epi_naomit()\n```\n:::\n\n\nAfter specifying the preprocessing steps, we will use the `parsnip` package 
for\nmodeling and producing the prediction for death count, 7 days after the\nlatest available date in the dataset. \n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-5_343ca6ec29d09be57ab611380fba40c7'}\n\n```{.r .cell-code}\nlatest <- get_test_data(r, counts_subset)\n\nwf <- epi_workflow(r, parsnip::poisson_reg()) %>%\n fit(counts_subset)\n\npredict(wf, latest) %>% filter(!is.na(.pred))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 5 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2023-06-07 16:52:32.877214\n#> \n#> # A tibble: 5 × 3\n#> geo_value time_value .pred\n#> * \n#> 1 ca 2021-12-31 108. \n#> 2 fl 2021-12-31 270. \n#> 3 nj 2021-12-31 22.5\n#> 4 ny 2021-12-31 94.8\n#> 5 tx 2021-12-31 91.0\n```\n:::\n:::\n\n\nNote that the `time_value` corresponds to the date(s) in the \ntest set `latest`, **NOT** to the target date of the forecast (2022-01-07). Had we used different data for predictions,\nwe would have gotten different `time_value`'s.\n\nLet's take a look at the fit:\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-6_31441e0d10c16615d3182594c1fec30f'}\n\n```{.r .cell-code}\nextract_fit_engine(wf)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> Call: stats::glm(formula = ..y ~ ., family = stats::poisson, data = data)\n#> \n#> Coefficients:\n#> (Intercept) geo_value_factor_fl geo_value_factor_nj \n#> 3.970e+00 -1.487e-01 -1.425e+00 \n#> geo_value_factor_ny geo_value_factor_tx lag_0_cases \n#> -6.865e-01 3.025e-01 1.339e-05 \n#> lag_7_cases lag_0_deaths lag_7_deaths \n#> 1.717e-06 1.731e-03 8.566e-04 \n#> \n#> Degrees of Freedom: 984 Total (i.e. 
Null); 976 Residual\n#> Null Deviance:\t 139600 \n#> Residual Deviance: 58110 \tAIC: 62710\n```\n:::\n:::\n\n\nAlternative forms of Poisson regression or particular computational approaches\ncan be applied via arguments to `parsnip::poisson_reg()` for some common\nsettings, and by using `parsnip::set_engine()` to use a specific Poisson\nregression engine and to provide additional engine-specific customization.\n\n\n\n## Linear Regression \n\nFor COVID-19, the CDC required submission of case and death count predictions. \nHowever, the Delphi Group preferred to train on rate data instead, because it \nputs different locations on a similar scale (eliminating the need for location-specific intercepts). \nWe can use a linear regression to predict the death rates and use state\npopulation data to scale the rates to counts.[^pois] We will do so using\n`layer_population_scaling()` from the `epipredict` package. (We could also use\n`step_population_scaling()` from the `epipredict` package to prepare rate data\nfrom count data in the preprocessing recipe.)\n\n[^pois]: We could continue with the Poisson model, but we'll switch to the Gaussian likelihood just for simplicity.\n\nAdditionally, when forecasts are submitted, prediction intervals should be \nprovided along with the point estimates. This can be obtained via postprocessing\nusing\n`layer_residual_quantiles()`. It is worth pointing out, however, that \n`layer_residual_quantiles()` should be used before population scaling or else \nthe transformation will make the results uninterpretable. \n\nWe wish, now, to predict the 7-day ahead death counts with lagged case rates and death\nrates, along with some extra behaviourial predictors. 
Namely, we will use survey data\nfrom [COVID-19 Trends and Impact Survey](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/fb-survey.html#behavior-indicators).\n\nThe survey data provides the estimated percentage of people who wore a mask for \nmost or all of the time while in public in the past 7 days and the estimated \npercentage of respondents who reported that all or most people they encountered \nin public in the past 7 days maintained a distance of at least 6 feet. \n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-7_e6a1705f9786970a292c5ed2f10ee461'}\n\n```{.r .cell-code code-fold=\"true\"}\n# Download the raw data as used in {epidatasets}\nbehav_ind_mask <- covidcast(\n data_source = \"fb-survey\",\n signals = \"smoothed_wwearing_mask_7d\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20210604, 20211231),\n geo_values = geos\n) %>%\n fetch() %>%\n select(geo_value, time_value, masking = value)\n\nbehav_ind_distancing <- covidcast(\n data_source = \"fb-survey\",\n signals = \"smoothed_wothers_distanced_public\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20210604, 20211231),\n geo_values = geos\n) %>%\n fetch() %>%\n select(geo_value, time_value, distancing = value)\n\nctis_covid_behaviours <- behav_ind_mask %>%\n full_join(behav_ind_distancing, by = c(\"geo_value\", \"time_value\"))\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-8_dbf1236064e03ae10b6694621ce91509'}\n\n```{.r .cell-code}\ndata(ctis_covid_behaviours, package = \"epidatasets\")\npop_dat <- state_census %>% select(abbr, pop)\n```\n:::\n\n\nState-wise population data from the 2019 U.S. 
Census is\navailable from `{epipredict}` and will be used in `layer_population_scaling()`.\n\n\n\nRather than using raw mask-wearing / social-distancing metrics, for the sake\nof illustration, we'll convert both into categorical predictors.\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-9_fc946e21bc9fa1be4fd6bab0666dd515'}\n::: {.cell-output-display}\n![](preprocessing-and-models_files/figure-html/unnamed-chunk-9-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nWe will take a subset of death rate and case rate data from the built-in dataset \n`case_death_rate_subset`.\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-10_80a089a8a6d7a1e74830826cacc5871c'}\n\n```{.r .cell-code}\njhu <- filter(\n case_death_rate_subset,\n time_value >= \"2021-06-04\",\n time_value <= \"2021-12-31\",\n geo_value %in% c(\"ca\", \"fl\", \"tx\", \"ny\", \"nj\")\n)\n```\n:::\n\n\nPreprocessing steps will again rely on functions from the `epipredict` package \nas well as the `recipes` package.\nThere are also many functions in the `recipes` package that allow for \n[scalar transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations),\nsuch as log transformations and data centering. In our case, we will \ncenter the numerical predictors to allow for a more meaningful interpretation of\nthe intercept. 
\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-11_2fb56af2f7c6d9d8b32c7c071d1446c6'}\n\n```{.r .cell-code}\njhu <- jhu %>%\n mutate(geo_value_factor = as.factor(geo_value)) %>%\n left_join(ctis_covid_behaviours, by = c(\"geo_value\", \"time_value\")) %>%\n as_epi_df()\n\nr <- epi_recipe(jhu) %>%\n add_role(geo_value_factor, new_role = \"predictor\") %>%\n step_dummy(geo_value_factor) %>%\n step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>%\n step_mutate(\n masking = cut_number(masking, 5),\n distancing = cut_number(distancing, 5)\n ) %>%\n step_epi_ahead(death_rate, ahead = 7, role = \"outcome\") %>%\n step_center(contains(\"lag\"), role = \"predictor\") %>%\n step_epi_naomit()\n```\n:::\n\n\nAs a sanity check we can examine the structure of the training data:\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-12_fcfc398d986b21bb83903a1497923da1'}\n\n```{.r .cell-code}\nglimpse(bake(prep(r, jhu), jhu))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Rows: 985\n#> Columns: 17\n#> $ time_value 2021-06-18, 2021-06-18, 2021-06-18, 2021-06-18…\n#> $ geo_value \"ca\", \"fl\", \"nj\", \"ny\", \"tx\", \"ca\", \"fl\", \"nj\",…\n#> $ case_rate 2.382641, 6.635633, 2.771139, 1.959257, 3.50565…\n#> $ death_rate 0.0373762, 0.1906224, 0.0707662, 0.0554089, 0.0…\n#> $ masking \"(69.7,85]\", \"(52.8,60.2]\", \"(60.2,63.9]\", \"(60…\n#> $ distancing \"(27,43]\", \"(21.1,27]\", \"(27,43]\", \"(27,43]\", \"…\n#> $ geo_value_factor_fl 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,…\n#> $ geo_value_factor_nj 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,…\n#> $ geo_value_factor_ny 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,…\n#> $ geo_value_factor_tx 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,…\n#> $ lag_0_case_rate -24.55902, -20.30603, -24.17052, -24.98241, -23…\n#> $ lag_7_case_rate -24.28505, -17.44078, -23.74271, -24.00795, -19…\n#> $ lag_14_case_rate -24.61817, 
-20.99358, -24.55491, -23.72352, -22…\n#> $ lag_0_death_rate -0.2444974, -0.0912512, -0.2111074, -0.2264647,…\n#> $ lag_7_death_rate -0.1875259, -0.0978243, -0.1869826, -0.2035624,…\n#> $ lag_14_death_rate -0.1980493, -0.1431793, -0.1532078, -0.1651456,…\n#> $ ahead_7_death_rate 0.1037824, 0.1426382, 0.0964993, 0.0347229, 0.0…\n```\n:::\n:::\n\n\nBefore directly predicting the results, we need to add postprocessing layers to\nobtain the death counts instead of death rates. Note that the rates used so\nfar are \"per 100K people\" rather than \"per person\". We'll also use quantile\nregression with the `quantile_reg` engine rather than ordinary least squares\nto create median predictions and a 90% prediction interval.\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-13_bed06d2db0383e9df0ccd38e405db06e'}\n\n```{.r .cell-code}\nf <- frosting() %>%\n layer_predict() %>%\n layer_add_target_date(\"2022-01-07\") %>%\n layer_add_forecast_date() %>%\n layer_threshold(.pred, lower = 0) %>%\n layer_quantile_distn() %>%\n layer_point_from_distn() %>%\n layer_naomit(.pred) %>%\n layer_population_scaling(\n contains(\".pred\"),\n df = pop_dat,\n rate_rescaling = 1e5,\n by = c(\"geo_value\" = \"abbr\"),\n df_pop_col = \"pop\"\n )\n\nwf <- epi_workflow(r, quantile_reg(tau = c(.05, .5, .95))) %>%\n fit(jhu) %>%\n add_frosting(f)\n\nlatest <- get_test_data(recipe = r, x = jhu)\np <- predict(wf, latest) %>%\n select(-time_value) %>%\n as_tibble()\np\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 5 × 7\n#> geo_value .pred target_date forecast_date .pred_distn .pred_scaled\n#> \n#> 1 ca 0.181 2022-01-07 2021-12-31 [0.25, 0.75] 71.6\n#> 2 fl 0.348 2022-01-07 2021-12-31 [0.25, 0.75] 74.7\n#> 3 nj 0.646 2022-01-07 2021-12-31 [0.25, 0.75] 57.4\n#> 4 ny 0.698 2022-01-07 2021-12-31 [0.25, 0.75] 136. 
\n#> 5 tx 0.299 2022-01-07 2021-12-31 [0.25, 0.75] 86.8\n#> # ℹ 1 more variable: .pred_distn_scaled \n```\n:::\n:::\n\n\nThe columns marked `*_scaled` (unfortunately, some of these\nare hidden above) \nhave been rescaled to the correct units, in this\ncase `deaths` rather than deaths per 100K people (these remain in `.pred`).\n\nTo look at the prediction intervals:\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-14_f587c993b2de8fc747d934406aa738a6'}\n\n```{.r .cell-code}\np %>%\n select(geo_value, target_date, .pred_scaled, .pred_distn_scaled) %>%\n pivot_quantiles(.pred_distn_scaled)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 5 × 5\n#> geo_value target_date .pred_scaled `0.25` `0.75`\n#> \n#> 1 ca 2022-01-07 71.6 48.8 94.0\n#> 2 fl 2022-01-07 74.7 48.4 104. \n#> 3 nj 2022-01-07 57.4 45.5 68.7\n#> 4 ny 2022-01-07 136. 108. 163. \n#> 5 tx 2022-01-07 86.8 68.6 107.\n```\n:::\n:::\n\n\n\nLast but not least, let's take a look at the regression fit and check the \ncoefficients:\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-15_ba04dff1d70f15ab185e944bdd928a86'}\n::: {.cell-output .cell-output-stdout}\n```\n#> Call:\n#> quantreg::rq(formula = ..y ~ ., tau = ~c(0.05, 0.5, 0.95), data = data, \n#> na.action = stats::na.omit, method = \"br\", model = FALSE)\n#> \n#> Coefficients:\n#> tau= 0.05 tau= 0.50 tau= 0.95\n#> (Intercept) 0.210811625 0.2962574475 0.417583265\n#> geo_value_factor_fl 0.032085820 0.0482361119 0.171126713\n#> geo_value_factor_nj 0.007313762 -0.0033797953 -0.025251865\n#> geo_value_factor_ny -0.001489163 -0.0199485947 -0.032635584\n#> geo_value_factor_tx 0.029077485 0.0391980273 0.071961515\n#> lag_0_case_rate -0.001636588 -0.0011625693 -0.001430622\n#> lag_7_case_rate 0.004700752 0.0057822095 0.006912655\n#> lag_14_case_rate 0.001715816 0.0004224753 0.003448733\n#> lag_0_death_rate 0.462341754 0.5274192012 0.164856372\n#> lag_7_death_rate 
-0.007368501 0.1132903956 0.172687438\n#> lag_14_death_rate -0.072500707 -0.0270474349 0.181279299\n#> \n#> Degrees of freedom: 950 total; 939 residual\n```\n:::\n:::\n\n\n## Classification\n\nSometimes it is preferable to create a predictive model for surges or upswings\nrather than for raw values. In this case,\nthe target is to predict if the future will have increased case rates (denoted `up`),\ndecreased case rates (`down`), or flat case rates (`flat`) relative to the current\nlevel. Such models may be \nreferred to as \"hotspot prediction models\". We will follow the analysis \nin [McDonald, Bien, Green, Hu, et al.](#references) but extend the application\nto predict three categories instead of two. \n\nHotspot prediction uses a categorical outcome variable defined in terms of the \nrelative change of $Y_{\\ell, t+a}$ compared to $Y_{\\ell, t}$. \nWhere $Y_{\\ell, t}$ denotes the case rates in location $\\ell$ at time $t$. \nWe define the response variables as follows:\n\n$$\n Z_{\\ell, t}=\n \\begin{cases}\n \\text{up}, & \\text{if}\\ Y^{\\Delta}_{\\ell, t} > 0.25 \\\\ \n \\text{down}, & \\text{if}\\ Y^{\\Delta}_{\\ell, t} < -0.20\\\\\n \\text{flat}, & \\text{otherwise}\n \\end{cases}\n$$\n\nwhere $Y^{\\Delta}_{\\ell, t} = (Y_{\\ell, t}- Y_{\\ell, t-7})\\ /\\ (Y_{\\ell, t-7})$. \nWe say location $\\ell$ is a hotspot at time $t$ when $Z_{\\ell,t}$ is \n`up`, meaning the number of newly reported cases over the past 7 days has \nincreased by at least 25% compared to the preceding week. When $Z_{\\ell,t}$ \nis categorized as `down`, it suggests that there has been at least a 20% \ndecrease in newly reported cases over the past 7 days (a 20% decrease is the inverse of a 25% increase). Otherwise, we will \nconsider the trend to be `flat`. 
\n\nThe expression of the multinomial regression we will use is as follows:\n$$\n\\pi_{j}(x) = \\text{Pr}(Z_{\\ell,t} = j|x) = \\frac{e^{g_j(x)}}{1 + \\sum_{k=0}^2 g_j(x) }\n$$\nwhere $j$ is either down, flat, or up\n\n$$\n\\begin{aligned}\ng_{\\text{down}}(x) &= 0,\\\\\ng_{\\text{flat}}(x) &= \n\\log\\left(\\frac{Pr(Z_{\\ell,t}=\\text{flat}|x)}{Pr(Z_{\\ell,t}=\\text{down}|x)}\\right) = \n\\beta_{10} + \\beta_{11}t + \\delta_{10} s_{\\text{state}_1} +\n\\delta_{11} s_{\\text{state}_2} + \\cdots \\nonumber \\\\\n&\\quad +\\ \\beta_{12} Y^{\\Delta}_{\\ell, t} +\n\\beta_{13} Y^{\\Delta}_{\\ell, t-7}, \\\\\ng_{\\text{flat}}(x) &= \\log\\left(\\frac{Pr(Z_{\\ell,t}=\\text{up}|x)}{Pr(Z_{\\ell,t}=\\text{down}|x)}\\right) = \n\\beta_{20} + \\beta_{21}t + \\delta_{20} s_{\\text{state}_1} +\n\\delta_{21} s_{\\text{state}_2} + \\cdots \\nonumber \\\\\n&\\quad +\\ \\beta_{22} Y^{\\Delta}_{\\ell, t} +\n\\beta_{23} Y^{\\Delta}_{\\ell, t-7}.\n\\end{aligned}\n$$\n\n\nPreprocessing steps are similar to the previous models with an additional step \nof categorizing the response variables. 
Again, we will use a subset of death rate and case rate data from our built-in dataset \n`case_death_rate_subset`.\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-16_6c59ca34a0fd30f1f204b03181f28c88'}\n\n```{.r .cell-code}\njhu_rates <- case_death_rate_subset %>%\n dplyr::filter(\n time_value >= \"2021-06-04\",\n time_value <= \"2021-12-31\",\n geo_value %in% c(\"ca\", \"fl\", \"tx\", \"ny\", \"nj\")\n ) %>%\n mutate(geo_value_factor = as.factor(geo_value))\n\nr <- epi_recipe(jhu_rates) %>%\n add_role(time_value, new_role = \"predictor\") %>%\n step_dummy(geo_value_factor) %>%\n step_growth_rate(case_rate, role = \"none\", prefix = \"gr_\") %>%\n step_epi_lag(starts_with(\"gr_\"), lag = c(0, 7, 14)) %>%\n step_epi_ahead(starts_with(\"gr_\"), ahead = 7, role = \"none\") %>%\n # note recipes::step_cut() has a bug in it, or we could use that here\n step_mutate(\n response = cut(\n ahead_7_gr_7_rel_change_case_rate,\n breaks = c(-Inf, -0.2, 0.25, Inf) / 7, # division gives weekly not daily\n labels = c(\"down\", \"flat\", \"up\")\n ),\n role = \"outcome\"\n ) %>%\n step_rm(has_role(\"none\"), has_role(\"raw\")) %>%\n step_epi_naomit()\n```\n:::\n\n\nWe will fit the multinomial regression and examine the predictions:\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-17_f55e79fcffe78515bd0042409ccfa0bc'}\n\n```{.r .cell-code}\nwf <- epi_workflow(r, parsnip::multinom_reg()) %>%\n fit(jhu_rates)\n\nlatest <- get_test_data(recipe = r, x = jhu_rates)\npredict(wf, latest) %>% filter(!is.na(.pred_class))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 5 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-31 12:08:25.791826\n#> \n#> # A tibble: 5 × 3\n#> geo_value time_value .pred_class\n#> * \n#> 1 ca 2021-12-31 up \n#> 2 fl 2021-12-31 up \n#> 3 nj 2021-12-31 up \n#> 4 ny 2021-12-31 up \n#> 5 tx 2021-12-31 
up\n```\n:::\n:::\n\n\nWe can also look at the estimated coefficients and model summary information:\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-18_118e02a282350ab72791be1f72b553d2'}\n\n```{.r .cell-code}\nextract_fit_engine(wf)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Call:\n#> nnet::multinom(formula = ..y ~ ., data = data, trace = FALSE)\n#> \n#> Coefficients:\n#> (Intercept) time_value geo_value_factor_fl geo_value_factor_nj\n#> flat -144.2225 0.007754539 -1.3251332 1.137558\n#> up -133.1995 0.007082200 -0.5081323 1.562699\n#> geo_value_factor_ny geo_value_factor_tx lag_0_gr_7_rel_change_case_rate\n#> flat 24.74419 -0.3345769 18.96357\n#> up 24.84975 -0.3176984 33.79521\n#> lag_7_gr_7_rel_change_case_rate lag_14_gr_7_rel_change_case_rate\n#> flat 33.19050 7.157027\n#> up 56.52376 4.684422\n#> \n#> Residual Deviance: 1157.928 \n#> AIC: 1193.928\n```\n:::\n:::\n\n\nOne could also use a formula in `epi_recipe()` to achieve the same results as \nabove. However, only one of `add_formula()`, `add_recipe()`, or \n`workflow_variables()` can be specified. 
For the purpose of demonstrating \n`add_formula` rather than `add_recipe`, we will `prep` and `bake` our recipe to\nreturn a `data.frame` that could be used for model fitting.\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-19_95018fa1894b856edd76e784a2756aa6'}\n\n```{.r .cell-code}\nb <- bake(prep(r, jhu_rates), jhu_rates)\n\nepi_workflow() %>%\n add_formula(\n response ~ geo_value + time_value + lag_0_gr_7_rel_change_case_rate +\n lag_7_gr_7_rel_change_case_rate + lag_14_gr_7_rel_change_case_rate\n ) %>%\n add_model(parsnip::multinom_reg()) %>%\n fit(data = b)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Workflow [trained] ═══════════════════════════════════════════════════════\n#> Preprocessor: Formula\n#> Model: multinom_reg()\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> response ~ geo_value + time_value + lag_0_gr_7_rel_change_case_rate + \n#> lag_7_gr_7_rel_change_case_rate + lag_14_gr_7_rel_change_case_rate\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> Call:\n#> nnet::multinom(formula = ..y ~ ., data = data, trace = FALSE)\n#> \n#> Coefficients:\n#> (Intercept) geo_valuefl geo_valuenj geo_valueny geo_valuetx time_value\n#> flat -144.2169 -1.3265567 1.133930 24.75059 -0.3335109 0.007754346\n#> up -133.3504 -0.5120227 1.559699 24.85666 -0.3158328 0.007090257\n#> lag_0_gr_7_rel_change_case_rate lag_7_gr_7_rel_change_case_rate\n#> flat 19.02258 33.20795\n#> up 33.84665 56.57066\n#> lag_14_gr_7_rel_change_case_rate\n#> flat 7.140357\n#> up 4.668902\n#> \n#> Residual Deviance: 1157.919 \n#> AIC: 1193.919\n```\n:::\n:::\n\n\n\n", + "markdown": "# Examples of Preprocessing and Models\n\n\n::: {.cell}\n\n:::\n\n\n\n## Introduction \n\nThe `epipredict` package uses the `tidymodels` framework, namely \n[`{recipes}`](https://recipes.tidymodels.org/) for \n[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences \nof 
feature engineering and [`{parsnip}`](https://parsnip.tidymodels.org/) \nfor a unified interface to a range of models. \n\n`epipredict` has additional customized feature engineering and preprocessing \nsteps that specifically work with panel data in this context, for example,\n`step_epi_lag()`, `step_population_scaling()`, \n`step_epi_naomit()`. They can be used along with most\nsteps from the `{recipes}` package for more feature engineering. \n\nIn this vignette, we will illustrate some examples of how to use `epipredict`\nwith `recipes` and `parsnip` for different purposes of \nepidemiological forecasting.\nWe will focus on basic autoregressive models, in which COVID cases and \ndeaths in the near future are predicted using a linear combination of cases\nand deaths in the near past.\n\nThe remaining vignette will be split into three sections. In the first\nsection, we \nwill use a Poisson regression to predict death counts. In the second section,\nwe will use a linear regression to predict death rates. Last but not least, we\nwill create a classification model for hotspot predictions. \n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-2_addbfa8195f76724bf5d26a47c5098c2'}\n\n```{.r .cell-code}\nlibrary(epidatr)\nlibrary(epipredict)\nlibrary(recipes)\nlibrary(workflows)\nlibrary(poissonreg)\n```\n:::\n\n\n## Poisson Regression \n\nDuring COVID-19, the U.S. Centers for Disease Control and Prevention (CDC) \ncollected models\nand forecasts to characterize the state of an outbreak and its course. 
They use\nit to inform public health decision makers on potential consequences of \ndeploying control measures.\n\nOne of the outcomes that the CDC forecasts is [death counts from COVID-19](https://www.cdc.gov/coronavirus/2019-ncov/science/forecasting/forecasting-us.html).\nAlthough there are many state-of-the-art models, we choose to use Poisson \nregression, the textbook example for modeling count data, as an illustration\nfor using the `epipredict` package with other existing `{tidymodels}` packages. \n\nThe (folded) code below gives the necessary commands to download this data\nfrom the Delphi Epidata API, but it is also built into the\n[`{epidatasets}`](https://cmu-delphi.github.io/epidatasets/reference/counts_subset.html)\npackage.\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/poisson-reg-data_c5de04bad991fb8b7f75c6079d291fcc'}\n\n```{.r .cell-code code-fold=\"true\"}\ngeos <- c(\"ca\", \"fl\", \"tx\", \"ny\", \"nj\")\nx <- pub_covidcast(\n source = \"jhu-csse\",\n signals = \"confirmed_incidence_num\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20210604, 20211231),\n geo_values = geos\n) %>%\n select(geo_value, time_value, cases = value)\n\ny <- pub_covidcast(\n source = \"jhu-csse\",\n signals = \"deaths_incidence_num\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20210604, 20211231),\n geo_values = geos\n) %>%\n select(geo_value, time_value, deaths = value)\n\ncounts_subset <- full_join(x, y, by = c(\"geo_value\", \"time_value\")) %>%\n as_epi_df()\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-3_6ecff77ccdcb29ad29987c08784305d2'}\n\n```{.r .cell-code}\ndata(counts_subset, package = \"epidatasets\")\n```\n:::\n\n\nThe `counts_subset` dataset\ncontains the number of confirmed cases and deaths from June 4, 2021 to \nDec 31, 2021 in some U.S. states. 
\n\nWe wish to predict the 7-day ahead death counts with lagged cases and deaths.\nFurthermore, we will let each state be a dummy variable. Using differential \nintercept coefficients, we can allow for an intercept shift between states.\n\nOne possible model takes the form\n\\begin{aligned}\n\\log\\left( \\mu_{t+7} \\right) &{}= \\beta_0 + \\delta_1 s_{\\text{state}_1} +\n\\delta_2 s_{\\text{state}_2} + \\cdots + \\nonumber \\\\ &\\quad\\beta_1 \\text{deaths}_{t} + \n\\beta_2 \\text{deaths}_{t-7} + \\beta_3 \\text{cases}_{t} + \n\\beta_4 \\text{cases}_{t-7},\n\\end{aligned}\nwhere $\\mu_{t+7} = \\mathbb{E}(\\text{deaths}_{t+7})$, and $\\text{deaths}_{t+7}$\nis assumed to follow a Poisson distribution with mean $\\mu_{t+7}$;\n$s_{\\text{state}}$ are dummy variables for each state and take values of either\n0 or 1.\n\nPreprocessing steps will be performed to prepare the\ndata for model fitting. But before diving into them, it will be helpful to understand what `roles` are in the `recipes` framework. \n\n---\n\n#### Aside on `recipes` {.unnumbered}\n\n`recipes` can assign one or more roles to each column in the data. The roles \nare not restricted to a predefined set; they can be anything. \nFor most conventional situations, they are typically “predictor” and/or \n\"outcome\". Additional roles enable targeted `step_*()` operations on specific \nvariables or groups of variables.\n\nIn our case, the role `predictor` is given to explanatory variables on the\nright-hand side of the model (in the equation above). \nThe role `outcome` is the response variable \nthat we wish to predict. `geo_value` and `time_value` are predefined roles \nthat are unique to the `epipredict` package. 
Since we work with `epi_df` \nobjects, all datasets should have `geo_value` and `time_value` passed through\nautomatically with these two roles assigned to the appropriate columns in the data.\n \nThe `recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) \nin bulk. There are a few handy functions that can be used together to help us \nmanipulate variable roles easily. \n\n> `update_role()` alters an existing role in the recipe or assigns an initial role \n> to variables that do not yet have a declared role.\n> \n> `add_role()` adds an additional role to variables that already have a role in \n> the recipe, without overwriting old roles.\n> \n> `remove_role()` eliminates a single existing role in the recipe.\n\n#### End aside {.unnumbered}\n\n---\n\nNotice in the following preprocessing steps, we used `add_role()` on \n`geo_value_factor` since, currently, the default role for it is `raw`, but\nwe would like to reuse this variable as a `predictor`.\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-4_90b277143c4c937a1680363162df6b8b'}\n\n```{.r .cell-code}\ncounts_subset <- counts_subset %>%\n mutate(geo_value_factor = as.factor(geo_value)) %>%\n as_epi_df()\n\nepi_recipe(counts_subset)\n\nr <- epi_recipe(counts_subset) %>%\n add_role(geo_value_factor, new_role = \"predictor\") %>%\n step_dummy(geo_value_factor) %>%\n ## Occasionally, data reporting errors / corrections result in negative\n ## cases / deaths\n step_mutate(cases = pmax(cases, 0), deaths = pmax(deaths, 0)) %>%\n step_epi_lag(cases, deaths, lag = c(0, 7)) %>%\n step_epi_ahead(deaths, ahead = 7, role = \"outcome\") %>%\n step_epi_naomit()\n```\n:::\n\n\nAfter specifying the preprocessing steps, we will use the `parsnip` package for\nmodeling and producing the prediction for death count, 7 days after the\nlatest available date in the dataset. 
\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-5_343ca6ec29d09be57ab611380fba40c7'}\n\n```{.r .cell-code}\nlatest <- get_test_data(r, counts_subset)\n\nwf <- epi_workflow(r, parsnip::poisson_reg()) %>%\n fit(counts_subset)\n\npredict(wf, latest) %>% filter(!is.na(.pred))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 5 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2023-06-07 16:52:32\n#> \n#> # A tibble: 5 × 3\n#> geo_value time_value .pred\n#> * \n#> 1 ca 2021-12-31 108. \n#> 2 fl 2021-12-31 270. \n#> 3 nj 2021-12-31 22.5\n#> 4 ny 2021-12-31 94.8\n#> 5 tx 2021-12-31 91.0\n```\n:::\n:::\n\n\nNote that the `time_value` corresponds to the date(s) in the \ntest set `latest`, **NOT** to the target date of the forecast (2022-01-07). Had we used different data for predictions,\nwe would have gotten different `time_value`'s.\n\nLet's take a look at the fit:\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-6_31441e0d10c16615d3182594c1fec30f'}\n\n```{.r .cell-code}\nextract_fit_engine(wf)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> Call: stats::glm(formula = ..y ~ ., family = stats::poisson, data = data)\n#> \n#> Coefficients:\n#> (Intercept) geo_value_factor_fl geo_value_factor_nj \n#> 3.970e+00 -1.487e-01 -1.425e+00 \n#> geo_value_factor_ny geo_value_factor_tx lag_0_cases \n#> -6.865e-01 3.025e-01 1.339e-05 \n#> lag_7_cases lag_0_deaths lag_7_deaths \n#> 1.717e-06 1.731e-03 8.566e-04 \n#> \n#> Degrees of Freedom: 984 Total (i.e. 
Null); 976 Residual\n#> Null Deviance:\t 139600 \n#> Residual Deviance: 58110 \tAIC: 62710\n```\n:::\n:::\n\n\nAlternative forms of Poisson regression or particular computational approaches\ncan be applied via arguments to `parsnip::poisson_reg()` for some common\nsettings, and by using `parsnip::set_engine()` to use a specific Poisson\nregression engine and to provide additional engine-specific customization.\n\n\n\n## Linear Regression \n\nFor COVID-19, the CDC required submission of case and death count predictions. \nHowever, the Delphi Group preferred to train on rate data instead, because it \nputs different locations on a similar scale (eliminating the need for location-specific intercepts). \nWe can use a linear regression to predict the death rates and use state\npopulation data to scale the rates to counts.[^pois] We will do so using\n`layer_population_scaling()` from the `epipredict` package. (We could also use\n`step_population_scaling()` from the `epipredict` package to prepare rate data\nfrom count data in the preprocessing recipe.)\n\n[^pois]: We could continue with the Poisson model, but we'll switch to the Gaussian likelihood just for simplicity.\n\nAdditionally, when forecasts are submitted, prediction intervals should be \nprovided along with the point estimates. These can be obtained via postprocessing\nusing\n`layer_residual_quantiles()`. It is worth pointing out, however, that \n`layer_residual_quantiles()` should be used before population scaling or else \nthe transformation will make the results uninterpretable. \n\nWe wish, now, to predict the 7-day ahead death counts with lagged case rates and death\nrates, along with some extra behavioural predictors. 
Namely, we will use survey data\nfrom [COVID-19 Trends and Impact Survey](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/fb-survey.html#behavior-indicators).\n\nThe survey data provides the estimated percentage of people who wore a mask for \nmost or all of the time while in public in the past 7 days and the estimated \npercentage of respondents who reported that all or most people they encountered \nin public in the past 7 days maintained a distance of at least 6 feet. \n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-7_fea19afe67d26c42ddc444543855e84a'}\n\n```{.r .cell-code code-fold=\"true\"}\n# Download the raw data as used in {epidatasets}\nbehav_ind_mask <- pub_covidcast(\n source = \"fb-survey\",\n signals = \"smoothed_wwearing_mask_7d\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20210604, 20211231),\n geo_values = geos\n) %>%\n select(geo_value, time_value, masking = value)\n\nbehav_ind_distancing <- pub_covidcast(\n source = \"fb-survey\",\n signals = \"smoothed_wothers_distanced_public\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20210604, 20211231),\n geo_values = geos\n) %>%\n select(geo_value, time_value, distancing = value)\n\nctis_covid_behaviours <- behav_ind_mask %>%\n full_join(behav_ind_distancing, by = c(\"geo_value\", \"time_value\"))\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-8_dbf1236064e03ae10b6694621ce91509'}\n\n```{.r .cell-code}\ndata(ctis_covid_behaviours, package = \"epidatasets\")\npop_dat <- state_census %>% select(abbr, pop)\n```\n:::\n\n\nState-wise population data from the 2019 U.S. 
Census is\navailable from `{epipredict}` and will be used in `layer_population_scaling()`.\n\n\n\nRather than using raw mask-wearing / social-distancing metrics, for the sake\nof illustration, we'll convert both into categorical predictors.\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-9_fc946e21bc9fa1be4fd6bab0666dd515'}\n::: {.cell-output-display}\n![](preprocessing-and-models_files/figure-html/unnamed-chunk-9-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nWe will take a subset of death rate and case rate data from the built-in dataset \n`case_death_rate_subset`.\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-10_80a089a8a6d7a1e74830826cacc5871c'}\n\n```{.r .cell-code}\njhu <- filter(\n case_death_rate_subset,\n time_value >= \"2021-06-04\",\n time_value <= \"2021-12-31\",\n geo_value %in% c(\"ca\", \"fl\", \"tx\", \"ny\", \"nj\")\n)\n```\n:::\n\n\nPreprocessing steps will again rely on functions from the `epipredict` package \nas well as the `recipes` package.\nThere are also many functions in the `recipes` package that allow for \n[scalar transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations),\nsuch as log transformations and data centering. In our case, we will \ncenter the numerical predictors to allow for a more meaningful interpretation of\nthe intercept. 
\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-11_2fb56af2f7c6d9d8b32c7c071d1446c6'}\n\n```{.r .cell-code}\njhu <- jhu %>%\n mutate(geo_value_factor = as.factor(geo_value)) %>%\n left_join(ctis_covid_behaviours, by = c(\"geo_value\", \"time_value\")) %>%\n as_epi_df()\n\nr <- epi_recipe(jhu) %>%\n add_role(geo_value_factor, new_role = \"predictor\") %>%\n step_dummy(geo_value_factor) %>%\n step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>%\n step_mutate(\n masking = cut_number(masking, 5),\n distancing = cut_number(distancing, 5)\n ) %>%\n step_epi_ahead(death_rate, ahead = 7, role = \"outcome\") %>%\n step_center(contains(\"lag\"), role = \"predictor\") %>%\n step_epi_naomit()\n```\n:::\n\n\nAs a sanity check we can examine the structure of the training data:\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-12_fcfc398d986b21bb83903a1497923da1'}\n\n```{.r .cell-code}\nglimpse(bake(prep(r, jhu), jhu))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Rows: 985\n#> Columns: 17\n#> $ time_value 2021-06-18, 2021-06-18, 2021-06-18, 2021-06-18…\n#> $ geo_value \"ca\", \"fl\", \"nj\", \"ny\", \"tx\", \"ca\", \"fl\", \"nj\",…\n#> $ case_rate 2.382641, 6.635633, 2.771139, 1.959257, 3.50565…\n#> $ death_rate 0.0373762, 0.1906224, 0.0707662, 0.0554089, 0.0…\n#> $ masking \"(69.7,85]\", \"(52.8,60.2]\", \"(60.2,63.9]\", \"(60…\n#> $ distancing \"(27,43]\", \"(21.1,27]\", \"(27,43]\", \"(27,43]\", \"…\n#> $ geo_value_factor_fl 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,…\n#> $ geo_value_factor_nj 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,…\n#> $ geo_value_factor_ny 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,…\n#> $ geo_value_factor_tx 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,…\n#> $ lag_0_case_rate -24.55902, -20.30603, -24.17052, -24.98241, -23…\n#> $ lag_7_case_rate -24.28505, -17.44078, -23.74271, -24.00795, -19…\n#> $ lag_14_case_rate -24.61817, 
-20.99358, -24.55491, -23.72352, -22…\n#> $ lag_0_death_rate -0.2444974, -0.0912512, -0.2111074, -0.2264647,…\n#> $ lag_7_death_rate -0.1875259, -0.0978243, -0.1869826, -0.2035624,…\n#> $ lag_14_death_rate -0.1980493, -0.1431793, -0.1532078, -0.1651456,…\n#> $ ahead_7_death_rate 0.1037824, 0.1426382, 0.0964993, 0.0347229, 0.0…\n```\n:::\n:::\n\n\nBefore directly predicting the results, we need to add postprocessing layers to\nobtain the death counts instead of death rates. Note that the rates used so\nfar are \"per 100K people\" rather than \"per person\". We'll also use quantile\nregression with the `quantile_reg` engine rather than ordinary least squares\nto create median predictions and a 90% prediction interval.\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-13_ddf56bb37ad1c3b42127e4577dcf6985'}\n\n```{.r .cell-code}\nf <- frosting() %>%\n layer_predict() %>%\n layer_add_target_date(\"2022-01-07\") %>%\n layer_add_forecast_date() %>%\n layer_threshold(.pred, lower = 0) %>%\n layer_quantile_distn() %>%\n layer_point_from_distn() %>%\n layer_naomit(.pred) %>%\n layer_population_scaling(\n contains(\".pred\"),\n df = pop_dat,\n rate_rescaling = 1e5,\n by = c(\"geo_value\" = \"abbr\"),\n df_pop_col = \"pop\"\n )\n\nwf <- epi_workflow(r, quantile_reg(quantile_levels = c(.05, .5, .95))) %>%\n fit(jhu) %>%\n add_frosting(f)\n\nlatest <- get_test_data(recipe = r, x = jhu)\np <- predict(wf, latest) %>%\n select(-time_value) %>%\n as_tibble()\np\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 5 × 7\n#> geo_value .pred target_date forecast_date .pred_distn .pred_scaled\n#> \n#> 1 ca 0.181 2022-01-07 2021-12-31 quantiles(0.18)[2] 71.6\n#> 2 fl 0.348 2022-01-07 2021-12-31 quantiles(0.36)[2] 74.7\n#> 3 nj 0.646 2022-01-07 2021-12-31 quantiles(0.64)[2] 57.4\n#> 4 ny 0.698 2022-01-07 2021-12-31 quantiles(0.69)[2] 136. 
\n#> 5 tx 0.299 2022-01-07 2021-12-31 quantiles(0.3)[2] 86.8\n#> # ℹ 1 more variable: .pred_distn_scaled \n```\n:::\n:::\n\n\nThe columns marked `*_scaled` (unfortunately, some of these\nare hidden above) \nhave been rescaled to the correct units, in this\ncase `deaths` rather than deaths per 100K people (these remain in `.pred`).\n\nTo look at the prediction intervals:\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-14_ff67c86ce4610a15216ab8282b701524'}\n\n```{.r .cell-code}\np %>%\n select(geo_value, target_date, .pred_scaled, .pred_distn_scaled) %>%\n pivot_quantiles_wider(.pred_distn_scaled)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 5 × 5\n#> geo_value target_date .pred_scaled `0.25` `0.75`\n#> \n#> 1 ca 2022-01-07 71.6 48.8 94.0\n#> 2 fl 2022-01-07 74.7 48.4 104. \n#> 3 nj 2022-01-07 57.4 45.5 68.7\n#> 4 ny 2022-01-07 136. 108. 163. \n#> 5 tx 2022-01-07 86.8 68.6 107.\n```\n:::\n:::\n\n\n\nLast but not least, let's take a look at the regression fit and check the \ncoefficients:\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-15_ba04dff1d70f15ab185e944bdd928a86'}\n::: {.cell-output .cell-output-stdout}\n```\n#> Call:\n#> quantreg::rq(formula = ..y ~ ., tau = ~c(0.05, 0.5, 0.95), data = data, \n#> na.action = stats::na.omit, method = \"br\", model = FALSE)\n#> \n#> Coefficients:\n#> tau= 0.05 tau= 0.50 tau= 0.95\n#> (Intercept) 0.210811625 0.2962574475 0.417583265\n#> geo_value_factor_fl 0.032085820 0.0482361119 0.171126713\n#> geo_value_factor_nj 0.007313762 -0.0033797953 -0.025251865\n#> geo_value_factor_ny -0.001489163 -0.0199485947 -0.032635584\n#> geo_value_factor_tx 0.029077485 0.0391980273 0.071961515\n#> lag_0_case_rate -0.001636588 -0.0011625693 -0.001430622\n#> lag_7_case_rate 0.004700752 0.0057822095 0.006912655\n#> lag_14_case_rate 0.001715816 0.0004224753 0.003448733\n#> lag_0_death_rate 0.462341754 0.5274192012 0.164856372\n#> 
lag_7_death_rate -0.007368501 0.1132903956 0.172687438\n#> lag_14_death_rate -0.072500707 -0.0270474349 0.181279299\n#> \n#> Degrees of freedom: 950 total; 939 residual\n```\n:::\n:::\n\n\n## Classification\n\nSometimes it is preferable to create a predictive model for surges or upswings\nrather than for raw values. In this case,\nthe target is to predict if the future will have increased case rates (denoted `up`),\ndecreased case rates (`down`), or flat case rates (`flat`) relative to the current\nlevel. Such models may be \nreferred to as \"hotspot prediction models\". We will follow the analysis \nin [McDonald, Bien, Green, Hu, et al.](#references) but extend the application\nto predict three categories instead of two. \n\nHotspot prediction uses a categorical outcome variable defined in terms of the \nrelative change of $Y_{\\ell, t+a}$ compared to $Y_{\\ell, t}$. \nWhere $Y_{\\ell, t}$ denotes the case rates in location $\\ell$ at time $t$. \nWe define the response variables as follows:\n\n$$\n Z_{\\ell, t}=\n \\begin{cases}\n \\text{up}, & \\text{if}\\ Y^{\\Delta}_{\\ell, t} > 0.25 \\\\ \n \\text{down}, & \\text{if}\\ Y^{\\Delta}_{\\ell, t} < -0.20\\\\\n \\text{flat}, & \\text{otherwise}\n \\end{cases}\n$$\n\nwhere $Y^{\\Delta}_{\\ell, t} = (Y_{\\ell, t}- Y_{\\ell, t-7})\\ /\\ (Y_{\\ell, t-7})$. \nWe say location $\\ell$ is a hotspot at time $t$ when $Z_{\\ell,t}$ is \n`up`, meaning the number of newly reported cases over the past 7 days has \nincreased by at least 25% compared to the preceding week. When $Z_{\\ell,t}$ \nis categorized as `down`, it suggests that there has been at least a 20% \ndecrease in newly reported cases over the past 7 days (a 20% decrease is the inverse of a 25% increase). Otherwise, we will \nconsider the trend to be `flat`. 
\n\nThe expression of the multinomial regression we will use is as follows:\n$$\n\\pi_{j}(x) = \\text{Pr}(Z_{\\ell,t} = j|x) = \\frac{e^{g_j(x)}}{1 + e^{g_{\\text{flat}}(x)} + e^{g_{\\text{up}}(x)}}\n$$\nwhere $j$ is one of down, flat, or up, and\n\n$$\n\\begin{aligned}\ng_{\\text{down}}(x) &= 0,\\\\\ng_{\\text{flat}}(x) &= \n\\log\\left(\\frac{Pr(Z_{\\ell,t}=\\text{flat}|x)}{Pr(Z_{\\ell,t}=\\text{down}|x)}\\right) = \n\\beta_{10} + \\beta_{11}t + \\delta_{10} s_{\\text{state}_1} +\n\\delta_{11} s_{\\text{state}_2} + \\cdots \\nonumber \\\\\n&\\quad +\\ \\beta_{12} Y^{\\Delta}_{\\ell, t} +\n\\beta_{13} Y^{\\Delta}_{\\ell, t-7}, \\\\\ng_{\\text{up}}(x) &= \\log\\left(\\frac{Pr(Z_{\\ell,t}=\\text{up}|x)}{Pr(Z_{\\ell,t}=\\text{down}|x)}\\right) = \n\\beta_{20} + \\beta_{21}t + \\delta_{20} s_{\\text{state}_1} +\n\\delta_{21} s_{\\text{state}_2} + \\cdots \\nonumber \\\\\n&\\quad +\\ \\beta_{22} Y^{\\Delta}_{\\ell, t} +\n\\beta_{23} Y^{\\Delta}_{\\ell, t-7}.\n\\end{aligned}\n$$\n\n\nPreprocessing steps are similar to the previous models with an additional step \nof categorizing the response variables. 
Again, we will use a subset of death rate and case rate data from our built-in dataset \n`case_death_rate_subset`.\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-16_6c59ca34a0fd30f1f204b03181f28c88'}\n\n```{.r .cell-code}\njhu_rates <- case_death_rate_subset %>%\n dplyr::filter(\n time_value >= \"2021-06-04\",\n time_value <= \"2021-12-31\",\n geo_value %in% c(\"ca\", \"fl\", \"tx\", \"ny\", \"nj\")\n ) %>%\n mutate(geo_value_factor = as.factor(geo_value))\n\nr <- epi_recipe(jhu_rates) %>%\n add_role(time_value, new_role = \"predictor\") %>%\n step_dummy(geo_value_factor) %>%\n step_growth_rate(case_rate, role = \"none\", prefix = \"gr_\") %>%\n step_epi_lag(starts_with(\"gr_\"), lag = c(0, 7, 14)) %>%\n step_epi_ahead(starts_with(\"gr_\"), ahead = 7, role = \"none\") %>%\n # note recipes::step_cut() has a bug in it, or we could use that here\n step_mutate(\n response = cut(\n ahead_7_gr_7_rel_change_case_rate,\n breaks = c(-Inf, -0.2, 0.25, Inf) / 7, # division gives weekly not daily\n labels = c(\"down\", \"flat\", \"up\")\n ),\n role = \"outcome\"\n ) %>%\n step_rm(has_role(\"none\"), has_role(\"raw\")) %>%\n step_epi_naomit()\n```\n:::\n\n\nWe will fit the multinomial regression and examine the predictions:\n\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-17_f55e79fcffe78515bd0042409ccfa0bc'}\n\n```{.r .cell-code}\nwf <- epi_workflow(r, parsnip::multinom_reg()) %>%\n fit(jhu_rates)\n\nlatest <- get_test_data(recipe = r, x = jhu_rates)\npredict(wf, latest) %>% filter(!is.na(.pred_class))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 5 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-31 12:08:25\n#> \n#> # A tibble: 5 × 3\n#> geo_value time_value .pred_class\n#> * \n#> 1 ca 2021-12-31 up \n#> 2 fl 2021-12-31 up \n#> 3 nj 2021-12-31 up \n#> 4 ny 2021-12-31 up \n#> 5 tx 2021-12-31 up\n```\n:::\n:::\n\n\nWe 
can also look at the estimated coefficients and model summary information:\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-18_118e02a282350ab72791be1f72b553d2'}\n\n```{.r .cell-code}\nextract_fit_engine(wf)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Call:\n#> nnet::multinom(formula = ..y ~ ., data = data, trace = FALSE)\n#> \n#> Coefficients:\n#> (Intercept) time_value geo_value_factor_fl geo_value_factor_nj\n#> flat -144.2225 0.007754541 -1.3251323 1.137559\n#> up -133.1994 0.007082196 -0.5081303 1.562700\n#> geo_value_factor_ny geo_value_factor_tx lag_0_gr_7_rel_change_case_rate\n#> flat 24.74419 -0.3345776 18.96354\n#> up 24.84975 -0.3176996 33.79518\n#> lag_7_gr_7_rel_change_case_rate lag_14_gr_7_rel_change_case_rate\n#> flat 33.19049 7.157042\n#> up 56.52374 4.684437\n#> \n#> Residual Deviance: 1157.928 \n#> AIC: 1193.928\n```\n:::\n:::\n\n\nOne could also use a formula in `epi_recipe()` to achieve the same results as \nabove. However, only one of `add_formula()`, `add_recipe()`, or \n`workflow_variables()` can be specified. 
For the purpose of demonstrating \n`add_formula` rather than `add_recipe`, we will `prep` and `bake` our recipe to\nreturn a `data.frame` that could be used for model fitting.\n\n::: {.cell layout-align=\"center\" hash='preprocessing-and-models_cache/html/unnamed-chunk-19_95018fa1894b856edd76e784a2756aa6'}\n\n```{.r .cell-code}\nb <- bake(prep(r, jhu_rates), jhu_rates)\n\nepi_workflow() %>%\n add_formula(\n response ~ geo_value + time_value + lag_0_gr_7_rel_change_case_rate +\n lag_7_gr_7_rel_change_case_rate + lag_14_gr_7_rel_change_case_rate\n ) %>%\n add_model(parsnip::multinom_reg()) %>%\n fit(data = b)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Workflow [trained] ═══════════════════════════════════════════════════════\n#> Preprocessor: Formula\n#> Model: multinom_reg()\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> response ~ geo_value + time_value + lag_0_gr_7_rel_change_case_rate + \n#> lag_7_gr_7_rel_change_case_rate + lag_14_gr_7_rel_change_case_rate\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> Call:\n#> nnet::multinom(formula = ..y ~ ., data = data, trace = FALSE)\n#> \n#> Coefficients:\n#> (Intercept) geo_valuefl geo_valuenj geo_valueny geo_valuetx time_value\n#> flat -144.2169 -1.3265549 1.133934 24.75059 -0.3335115 0.007754345\n#> up -133.3502 -0.5120186 1.559702 24.85665 -0.3158343 0.007090249\n#> lag_0_gr_7_rel_change_case_rate lag_7_gr_7_rel_change_case_rate\n#> flat 19.02252 33.20794\n#> up 33.84660 56.57061\n#> lag_14_gr_7_rel_change_case_rate\n#> flat 7.140372\n#> up 4.668915\n#> \n#> Residual Deviance: 1157.919 \n#> AIC: 1193.919\n```\n:::\n:::\n\n\n\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/slide/execute-results/html.json b/_freeze/slide/execute-results/html.json index 95aae08..70b9578 100644 --- a/_freeze/slide/execute-results/html.json +++ b/_freeze/slide/execute-results/html.json @@ -1,7 +1,7 @@ { 
- "hash": "b19c5c683d95cba2118e9f01196bee02", + "hash": "74d3db577831e2a6d86c6a6c2939a7d3", "result": { - "markdown": "# Sliding computations {#sec-sliding}\n\nA central tool in the `{epiprocess}` package is `epi_slide()`, which is based\non the powerful functionality provided in the \n[`slider`](https://cran.r-project.org/web/packages/slider) package. In\n`{epiprocess}`, to \"slide\" means to apply a computation---represented as a\nfunction or formula---over a sliding/rolling data window. Suitable\ngroupings can always be achieved by a preliminary call to `group_by()`.\n\nBy default, the meaning of one time step is inferred from the `time_value`\ncolumn of the `epi_df` object under consideration, based on the way this column\nunderstands addition and subtraction. For example, if the time values are coded\nas `Date` objects, then one time step is one day, since \n`as.Date(\"2022-01-01\") + 1` equals `as.Date(\"2022-01-02\")`. Alternatively, the time step can be specified\nmanually in the call to `epi_slide()`; you can read the documentation for more\ndetails. 
Furthermore, the alignment of the running window used in `epi_slide()`\ncan be \"right\", \"center\", or \"left\"; the default is \"right\", and is what we use\nin this vignette.\n\nAs in getting started guide, we'll fetch daily reported COVID-19 cases from CA,\nFL, NY, and TX (note: here we're using new, not cumulative cases) using the\n[`epidatr`](https://github.com/cmu-delphi/epidatr) package,\nand then convert this to `epi_df` format.\n\n\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-2_feb3ab09af2a656b7552aabd4fb92768'}\n\n```{.r .cell-code}\nlibrary(epidatr)\nlibrary(epiprocess)\nlibrary(epipredict)\n```\n:::\n\n\nThe example data we'll use is part of the package and has 2,684 rows and 3 columns.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-3_de5ebab547ecc5d1e32e4f6b65aac60b'}\n\n```{.r .cell-code}\ndata(jhu_csse_daily_subset)\nx <- jhu_csse_daily_subset %>%\n select(geo_value, time_value, cases) %>%\n arrange(geo_value, time_value) %>%\n as_epi_df()\n```\n:::\n\n\n\n## Slide with a formula\n\nWe first demonstrate how to apply a 7-day trailing average to the daily cases in\norder to smooth the signal, by passing in a formula for the first argument of\n`epi_slide()`. 
To do this computation per state, we first call `group_by()`.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-4_13b28969f566d77bd0c5e1e88a551491'}\n\n```{.r .cell-code}\nx %>%\n group_by(geo_value) %>%\n epi_slide(~ mean(.x$cases), before = 6) %>%\n ungroup()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 4,026 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-23 13:17:07.044666\n#> \n#> # A tibble: 4,026 × 4\n#> geo_value time_value cases slide_value\n#> * \n#> 1 ca 2020-03-01 6 6 \n#> 2 ca 2020-03-02 4 5 \n#> 3 ca 2020-03-03 6 5.33\n#> 4 ca 2020-03-04 11 6.75\n#> 5 ca 2020-03-05 10 7.4 \n#> 6 ca 2020-03-06 18 9.17\n#> # ℹ 4,020 more rows\n```\n:::\n:::\n\n\nThe formula specified has access to all non-grouping columns present in the\noriginal `epi_df` object (and must refer to them with the prefix `.x$`). As we\ncan see, the function `epi_slide()` returns an `epi_df` object with a new column\nappended that contains the results (from sliding), named `slide_value` as the\ndefault. 
We can of course change this post hoc, or we can instead specify a new\nname up front using the `new_col_name` argument:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-5_cf02a2675d6bbdf3eb316e16406a82e5'}\n\n```{.r .cell-code}\nx %>%\n group_by(geo_value) %>%\n epi_slide(~ mean(.x$cases), before = 6, new_col_name = \"cases_7dav\") %>%\n ungroup()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 4,026 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-23 13:17:07.044666\n#> \n#> # A tibble: 4,026 × 4\n#> geo_value time_value cases cases_7dav\n#> * \n#> 1 ca 2020-03-01 6 6 \n#> 2 ca 2020-03-02 4 5 \n#> 3 ca 2020-03-03 6 5.33\n#> 4 ca 2020-03-04 11 6.75\n#> 5 ca 2020-03-05 10 7.4 \n#> 6 ca 2020-03-06 18 9.17\n#> # ℹ 4,020 more rows\n```\n:::\n:::\n\n\nSome other information is available in additional variables:\n\n* `.group_key` is a one-row tibble containing the values of the grouping\n variables for the associated group\n* `.ref_time_value` is the reference time value the time window was based on\n\nLike in `group_modify()`, there are alternative names for these variables as\nwell: `.` can be used instead of `.x`, `.y` instead of `.group_key`, and `.z`\ninstead of `.ref_time_value`.\n\n## Slide with a function \n\nWe can also pass a function for the first argument in `epi_slide()`. 
In this\ncase, the passed function must accept the following arguments:\n\nIn this case, the passed function `f` must accept the following arguments: a\ndata frame with the same column names as the original object, minus any grouping\nvariables, containing the time window data for one group-`ref_time_value`\ncombination; followed by a one-row tibble containing the values of the grouping\nvariables for the associated group; followed by the associated `ref_time_value`.\nIt can accept additional arguments; `epi_slide()` will forward any `...` args it\nreceives to `f`.\n\nRecreating the last example of a 7-day trailing average:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-6_63c4174606b3c7249ee9ddd5f3171d78'}\n\n```{.r .cell-code}\nx %>%\n group_by(geo_value) %>%\n epi_slide(function(x, gk, rtv) mean(x$cases),\n before = 6, new_col_name = \"cases_7dav\"\n ) %>%\n ungroup()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 4,026 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-23 13:17:07.044666\n#> \n#> # A tibble: 4,026 × 4\n#> geo_value time_value cases cases_7dav\n#> * \n#> 1 ca 2020-03-01 6 6 \n#> 2 ca 2020-03-02 4 5 \n#> 3 ca 2020-03-03 6 5.33\n#> 4 ca 2020-03-04 11 6.75\n#> 5 ca 2020-03-05 10 7.4 \n#> 6 ca 2020-03-06 18 9.17\n#> # ℹ 4,020 more rows\n```\n:::\n:::\n\n\n## Slide the tidy way\n\nPerhaps the most convenient way to setup a computation in `epi_slide()` is to\npass in an expression for tidy evaluation. In this case, we can simply define\nthe name of the new column directly as part of the expression, setting it equal\nto a computation in which we can access any columns of `x` by name, just as we\nwould in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. 
For example:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-7_86937bdc4f9b436be5721bf89cb48542'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n epi_slide(cases_7dav = mean(cases), before = 6) %>%\n ungroup()\n```\n:::\n\nIn addition to referring to individual columns by name, you can refer to the\ntime window data as an `epi_df` or `tibble` using `.x`. Similarly, the other arguments of the function format are available through the magic names `.group_key` and `.ref_time_value`, and the tidyverse \"pronouns\" `.data` and `.env` can also be used.\n\nAs a simple sanity check, we visualize the 7-day trailing averages computed on\ntop of the original counts.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-8_4be7d7ffd8b84de93dbeff6c68bf1113'}\n\n```{.r .cell-code code-fold=\"true\"}\ncols <- RColorBrewer::brewer.pal(7, \"Set1\")[-6]\nggplot(x, aes(x = time_value)) +\n geom_col(aes(y = cases, fill = geo_value),\n alpha = 0.5,\n show.legend = FALSE\n ) +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) +\n scale_fill_manual(values = cols) +\n scale_color_manual(values = cols) +\n facet_wrap(~geo_value, scales = \"free_y\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Reported COVID-19 cases\")\n```\n\n::: {.cell-output-display}\n![](slide_files/figure-html/unnamed-chunk-8-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAs we can see from the center top panel, it looks like Florida moved to weekly \nreporting of COVID-19 cases in summer of 2021, while California occasionally reported negative cases counts!\n\n## Running a local forecaster {#sec-local-forecaster}\n\nAs a more complex example, we preview some of the functionality of `{epipredict}` described in future chapters, and use a forecaster based on a\nlocal (in time)\nautoregression or \"AR model\". 
AR models can be fit in numerous ways \n(using base R\nfunctions and various packages), but here we the `arx_forecaster()`, implemented in `{epipredict}` both\nprovides a more advanced example of sliding a function over an `epi_df` object,\nand it allows us to be a bit more flexible in defining a *probabilistic*\nforecaster: one that outputs not just a point prediction, but a notion of\nuncertainty around this. In particular, our forecaster will output a point\nprediction along with an 90\\% uncertainty band, represented by a predictive\nquantiles at the 5\\% and 95\\% levels (lower and upper endpoints of the\nuncertainty band).\n\nThe function signature below, is a probabilistic AR forecaster. The\n`lags` argument indicates which lags to use in the model, and `ahead` indicates\nhow far ahead in the future to make forecasts (both are encoded in terms of the\nunits of the `time_value` column; so, days, in the working `epi_df` being\nconsidered in this vignette).\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-9_bf2bf3063005fa308698ea53fdd30f5f'}\n\n```{.r .cell-code}\narx_forecaster <- function(\n epi_df, \n outcome, # the outcome column name in `epi_df`\n predictors, # a character vector, containing 1 or more predictors in `epi_df`\n trainer = quantile_reg(), \n args_list = arx_args_list(\n lags = c(0, 7, 14), \n ahead = 7,\n levels = c(0.05, 0.95)\n )\n)\n```\n:::\n\n\nWe go ahead and slide this AR forecaster over the working `epi_df` of COVID-19 \ncases. Note that we actually model the `cases_7dav` column, to operate on the \nscale of smoothed COVID-19 cases. 
This is clearly equivalent, up to a constant,\nto modeling weekly sums of COVID-19 cases.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-10_4ecf05c582d8dec6a2a1625d5edcaeda'}\n\n```{.r .cell-code}\nfc_time_values <- seq(\n from = as.Date(\"2020-06-01\"),\n to = as.Date(\"2021-12-01\"),\n by = \"1 months\"\n)\n\nfcasts <- epi_slide(\n x,\n ~ arx_forecaster(\n epi_data = .x,\n outcome = \"cases_7dav\",\n predictors = \"cases_7dav\",\n trainer = quantile_reg(),\n args_list = arx_args_list(ahead = 7)\n )$predictions,\n before = 119,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n)\n\n# grab just the relevant columns, and make them easier to plot\nfcasts <- fcasts %>%\n select(\n geo_value, time_value, cases_7dav,\n contains(\"_distn\"), fc_target_date\n ) %>%\n pivot_quantiles(contains(\"_distn\"))\nfcasts\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 114 × 7\n#> geo_value time_value cases_7dav fc_target_date `0.05` `0.5` `0.95`\n#> \n#> 1 ca 2020-06-01 2655. 2020-06-08 1940. 2694. 3840.\n#> 2 fl 2020-06-01 726. 2020-06-08 558. 747. 1290.\n#> 3 ga 2020-06-01 643. 2020-06-08 520. 638. 1083.\n#> 4 ny 2020-06-01 1278. 2020-06-08 821. 1044. 1864.\n#> 5 pa 2020-06-01 603. 2020-06-08 450. 570. 1080.\n#> 6 tx 2020-06-01 1002. 2020-06-08 716. 1134. 1950.\n#> # ℹ 108 more rows\n```\n:::\n:::\n\n\nNote that here we have used an argument `ref_time_values` to perform the\nsliding computation (here, compute a forecast) at a specific subset of reference\ntime values. 
We get out 4 new columns: `fc_target_date`, `0.05`, `0.5`, `0.95`\nthat correspond to the date the forecast is for (rather than the date it was made on, the point forecast, and the lower and upper endpoints of the\n95\\% prediction band.[^1]\n\n[^1]: If instead we had set `as_list_col = TRUE`\nin the call to `epi_slide()`, then we would have gotten a list column `fc`, \nwhere each element of `fc` contains these results.\n\nTo finish off, we plot the forecasts at some times (spaced out by a few months)\nover the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do \nso, we encapsulate the process of generating forecasts into a simple function, \nso that we can call it a few times.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-11_7f504b8f9472dc792d7a362b1998339e'}\n\n```{.r .cell-code}\nk_week_ahead <- function(ahead = 7) {\n epi_slide(\n x,\n ~ arx_forecaster(\n epi_data = .x,\n outcome = \"cases_7dav\",\n predictors = \"cases_7dav\",\n trainer = quantile_reg(),\n args_list = arx_args_list(ahead = ahead)\n )$predictions,\n before = 119,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n select(\n geo_value, time_value, cases_7dav, contains(\"_distn\"),\n fc_target_date\n ) %>%\n pivot_quantiles(contains(\"_distn\"))\n}\n\n# First generate the forecasts, and bind them together\nz <- map(c(7, 14, 21, 28), k_week_ahead) %>% list_rbind()\n```\n:::\n\n\nThen we can plot the on top of the observed data\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-12_f17b1e21df0fa2849ed240533f7e168f'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(z) +\n geom_line(data = x, aes(x = time_value, y = cases_7dav), color = \"gray50\") +\n geom_ribbon(aes(\n x = fc_target_date, ymin = `0.05`, ymax = `0.95`,\n group = time_value, fill = geo_value\n ), alpha = 0.4) +\n geom_line(aes(x = fc_target_date, y = `0.5`, group = time_value)) +\n geom_point(aes(x = fc_target_date, y = `0.5`, group = 
time_value), size = 0.5) +\n # geom_vline(data = tibble(x = fc_time_values), aes(xintercept = x),\n # linetype = 2, alpha = 0.5) +\n facet_wrap(vars(geo_value), scales = \"free_y\", nrow = 3) +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n scale_x_date(minor_breaks = \"1 months\", date_labels = \"%b %Y\") +\n scale_fill_viridis_d(guide = \"none\", end = .9) +\n labs(x = \"Date\", y = \"Reported COVID-19 cases\")\n```\n\n::: {.cell-output-display}\n![](slide_files/figure-html/unnamed-chunk-12-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nTwo points are worth making. First, the AR model's performance here is pretty\nspotty. At various points in time, we can see that its forecasts are volatile\n(its point predictions are all over the place), or overconfident (its bands are\ntoo narrow), or both at the same time. This is only meant as a simple demo and\nnot entirely unexpected given the way the AR model is set up. The\n[`epipredict`](https://cmu-delphi.github.io/epipredict) package, \noffers a suite of predictive modeling tools \nthat improve on many of the shortcomings of the above simple AR model (simply \nusing all states for training rather than 6 is a huge improvement).\n\nSecond, the AR forecaster here is using finalized data, meaning, it uses the\nlatest versions of signal values (reported COVID-19 cases) available, for both\ntraining models and making predictions historically. However, this is not\nreflective of the provisional nature of the data that it must cope with in a\ntrue forecast task. Training and making predictions on finalized data can lead\nto an overly optimistic sense of accuracy; see, for example, \n[@McDonaldBien2021] and references\ntherein. 
Fortunately, the `epiprocess` package provides a data structure called\n`epi_archive` that can be used to store all data revisions, and furthermore, an\n`epi_archive` object knows how to slide computations in the correct\nversion-aware sense (for the computation at each reference time $t$, it uses\nonly data that would have been available as of $t$). We will revisit this \nexample in the [archive \nvignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html).\n", + "markdown": "# Sliding computations {#sec-sliding}\n\nA central tool in the `{epiprocess}` package is `epi_slide()`, which is based\non the powerful functionality provided in the \n[`slider`](https://cran.r-project.org/web/packages/slider) package. In\n`{epiprocess}`, to \"slide\" means to apply a computation---represented as a\nfunction or formula---over a sliding/rolling data window. Suitable\ngroupings can always be achieved by a preliminary call to `group_by()`.\n\nBy default, the meaning of one time step is inferred from the `time_value`\ncolumn of the `epi_df` object under consideration, based on the way this column\nunderstands addition and subtraction. For example, if the time values are coded\nas `Date` objects, then one time step is one day, since \n`as.Date(\"2022-01-01\") + 1` equals `as.Date(\"2022-01-02\")`. Alternatively, the time step can be specified\nmanually in the call to `epi_slide()`; you can read the documentation for more\ndetails. 
Furthermore, the alignment of the running window used in `epi_slide()`\ncan be \"right\", \"center\", or \"left\"; the default is \"right\", and is what we use\nin this vignette.\n\nAs in the getting started guide, we'll fetch daily reported COVID-19 cases from CA,\nFL, GA, NY, PA, and TX (note: here we're using new, not cumulative cases) using the\n[`epidatr`](https://github.com/cmu-delphi/epidatr) package,\nand then convert this to `epi_df` format.\n\n\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-2_feb3ab09af2a656b7552aabd4fb92768'}\n\n```{.r .cell-code}\nlibrary(epidatr)\nlibrary(epiprocess)\nlibrary(epipredict)\n```\n:::\n\n\nThe example data we'll use is part of the package and has 4,026 rows and 3 columns.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-3_de5ebab547ecc5d1e32e4f6b65aac60b'}\n\n```{.r .cell-code}\ndata(jhu_csse_daily_subset)\nx <- jhu_csse_daily_subset %>%\n select(geo_value, time_value, cases) %>%\n arrange(geo_value, time_value) %>%\n as_epi_df()\n```\n:::\n\n\n\n## Slide with a formula\n\nWe first demonstrate how to apply a 7-day trailing average to the daily cases in\norder to smooth the signal, by passing in a formula for the first argument of\n`epi_slide()`. 
To do this computation per state, we first call `group_by()`.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-4_13b28969f566d77bd0c5e1e88a551491'}\n\n```{.r .cell-code}\nx %>%\n group_by(geo_value) %>%\n epi_slide(~ mean(.x$cases), before = 6) %>%\n ungroup()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 4,026 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-23 13:17:07\n#> \n#> # A tibble: 4,026 × 4\n#> geo_value time_value cases slide_value\n#> * \n#> 1 ca 2020-03-01 6 6 \n#> 2 ca 2020-03-02 4 5 \n#> 3 ca 2020-03-03 6 5.33\n#> 4 ca 2020-03-04 11 6.75\n#> 5 ca 2020-03-05 10 7.4 \n#> 6 ca 2020-03-06 18 9.17\n#> # ℹ 4,020 more rows\n```\n:::\n:::\n\n\nThe formula specified has access to all non-grouping columns present in the\noriginal `epi_df` object (and must refer to them with the prefix `.x$`). As we\ncan see, the function `epi_slide()` returns an `epi_df` object with a new column\nappended that contains the results (from sliding), named `slide_value` as the\ndefault. 
We can of course change this post hoc, or we can instead specify a new\nname up front using the `new_col_name` argument:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-5_cf02a2675d6bbdf3eb316e16406a82e5'}\n\n```{.r .cell-code}\nx %>%\n group_by(geo_value) %>%\n epi_slide(~ mean(.x$cases), before = 6, new_col_name = \"cases_7dav\") %>%\n ungroup()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 4,026 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-23 13:17:07\n#> \n#> # A tibble: 4,026 × 4\n#> geo_value time_value cases cases_7dav\n#> * \n#> 1 ca 2020-03-01 6 6 \n#> 2 ca 2020-03-02 4 5 \n#> 3 ca 2020-03-03 6 5.33\n#> 4 ca 2020-03-04 11 6.75\n#> 5 ca 2020-03-05 10 7.4 \n#> 6 ca 2020-03-06 18 9.17\n#> # ℹ 4,020 more rows\n```\n:::\n:::\n\n\nSome other information is available in additional variables:\n\n* `.group_key` is a one-row tibble containing the values of the grouping\n variables for the associated group\n* `.ref_time_value` is the reference time value the time window was based on\n\nLike in `group_modify()`, there are alternative names for these variables as\nwell: `.` can be used instead of `.x`, `.y` instead of `.group_key`, and `.z`\ninstead of `.ref_time_value`.\n\n## Slide with a function \n\nWe can also pass a function for the first argument in `epi_slide()`. 
In this case, the passed function `f` must accept the following arguments: a\ndata frame with the same column names as the original object, minus any grouping\nvariables, containing the time window data for one group-`ref_time_value`\ncombination; followed by a one-row tibble containing the values of the grouping\nvariables for the associated group; followed by the associated `ref_time_value`.\nIt can accept additional arguments; `epi_slide()` will forward any `...` args it\nreceives to `f`.\n\nRecreating the last example of a 7-day trailing average:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-6_63c4174606b3c7249ee9ddd5f3171d78'}\n\n```{.r .cell-code}\nx %>%\n group_by(geo_value) %>%\n epi_slide(function(x, gk, rtv) mean(x$cases),\n before = 6, new_col_name = \"cases_7dav\"\n ) %>%\n ungroup()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 4,026 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-23 13:17:07\n#> \n#> # A tibble: 4,026 × 4\n#> geo_value time_value cases cases_7dav\n#> * \n#> 1 ca 2020-03-01 6 6 \n#> 2 ca 2020-03-02 4 5 \n#> 3 ca 2020-03-03 6 5.33\n#> 4 ca 2020-03-04 11 6.75\n#> 5 ca 2020-03-05 10 7.4 \n#> 6 ca 2020-03-06 18 9.17\n#> # ℹ 4,020 more rows\n```\n:::\n:::\n\n\n## Slide the tidy way\n\nPerhaps the most convenient way to set up a computation in `epi_slide()` is to\npass in an expression for tidy evaluation. In this case, we can simply define\nthe name of the new column directly as part of the expression, setting it equal\nto a computation in which we can access any columns of `x` by name, just as we\nwould in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. 
For example:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-7_86937bdc4f9b436be5721bf89cb48542'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n epi_slide(cases_7dav = mean(cases), before = 6) %>%\n ungroup()\n```\n:::\n\nIn addition to referring to individual columns by name, you can refer to the\ntime window data as an `epi_df` or `tibble` using `.x`. Similarly, the other arguments of the function format are available through the magic names `.group_key` and `.ref_time_value`, and the tidyverse \"pronouns\" `.data` and `.env` can also be used.\n\nAs a simple sanity check, we visualize the 7-day trailing averages computed on\ntop of the original counts.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-8_4be7d7ffd8b84de93dbeff6c68bf1113'}\n\n```{.r .cell-code code-fold=\"true\"}\ncols <- RColorBrewer::brewer.pal(7, \"Set1\")[-6]\nggplot(x, aes(x = time_value)) +\n geom_col(aes(y = cases, fill = geo_value),\n alpha = 0.5,\n show.legend = FALSE\n ) +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) +\n scale_fill_manual(values = cols) +\n scale_color_manual(values = cols) +\n facet_wrap(~geo_value, scales = \"free_y\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Reported COVID-19 cases\")\n```\n\n::: {.cell-output-display}\n![](slide_files/figure-html/unnamed-chunk-8-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAs we can see from the center top panel, it looks like Florida moved to weekly \nreporting of COVID-19 cases in summer of 2021, while California occasionally reported negative cases counts!\n\n## Running a local forecaster {#sec-local-forecaster}\n\nAs a more complex example, we preview some of the functionality of `{epipredict}` described in future chapters, and use a forecaster based on a\nlocal (in time)\nautoregression or \"AR model\". 
AR models can be fit in numerous ways \n(using base R\nfunctions and various packages), but here we use `arx_forecaster()`, implemented in `{epipredict}`, because it both\nprovides a more advanced example of sliding a function over an `epi_df` object\nand allows us to be a bit more flexible in defining a *probabilistic*\nforecaster: one that outputs not just a point prediction, but a notion of\nuncertainty around this. In particular, our forecaster will output a point\nprediction along with a 90\\% uncertainty band, represented by predictive\nquantiles at the 5\\% and 95\\% levels (lower and upper endpoints of the\nuncertainty band).\n\nThe function signature below is that of a probabilistic AR forecaster. The\n`lags` argument indicates which lags to use in the model, and `ahead` indicates\nhow far ahead in the future to make forecasts (both are encoded in terms of the\nunits of the `time_value` column; so, days, in the working `epi_df` being\nconsidered in this vignette).\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-9_079e5420d9e5d2f5501eb74de8b45cb6'}\n\n```{.r .cell-code}\narx_forecaster <- function(\n epi_df, \n outcome, # the outcome column name in `epi_df`\n predictors, # a character vector, containing 1 or more predictors in `epi_df`\n trainer = quantile_reg(), \n args_list = arx_args_list(\n lags = c(0, 7, 14), \n ahead = 7,\n quantile_levels = c(0.05, 0.95)\n )\n)\n```\n:::\n\n\nWe go ahead and slide this AR forecaster over the working `epi_df` of COVID-19 \ncases. Note that we actually model the `cases_7dav` column, to operate on the \nscale of smoothed COVID-19 cases. 
This is clearly equivalent, up to a constant,\nto modeling weekly sums of COVID-19 cases.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-10_9e2bba94dc13dd185ad30b365f0a4eb4'}\n\n```{.r .cell-code}\nfc_time_values <- seq(\n from = as.Date(\"2020-06-01\"),\n to = as.Date(\"2021-12-01\"),\n by = \"1 months\"\n)\n\nfcasts <- epi_slide(\n x,\n ~ arx_forecaster(\n epi_data = .x,\n outcome = \"cases_7dav\",\n predictors = \"cases_7dav\",\n trainer = quantile_reg(),\n args_list = arx_args_list(ahead = 7)\n )$predictions,\n before = 119,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n)\n\n# grab just the relevant columns, and make them easier to plot\nfcasts <- fcasts %>%\n select(\n geo_value, time_value, cases_7dav,\n contains(\"_distn\"), fc_target_date\n ) %>%\n pivot_quantiles_wider(contains(\"_distn\"))\nfcasts\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 114 × 7\n#> geo_value time_value cases_7dav fc_target_date `0.05` `0.5` `0.95`\n#> \n#> 1 ca 2020-06-01 2655. 2020-06-08 1940. 2694. 3840.\n#> 2 fl 2020-06-01 726. 2020-06-08 558. 747. 1290.\n#> 3 ga 2020-06-01 643. 2020-06-08 520. 638. 1083.\n#> 4 ny 2020-06-01 1278. 2020-06-08 821. 1044. 1864.\n#> 5 pa 2020-06-01 603. 2020-06-08 450. 570. 1080.\n#> 6 tx 2020-06-01 1002. 2020-06-08 716. 1134. 1950.\n#> # ℹ 108 more rows\n```\n:::\n:::\n\n\nNote that here we have used an argument `ref_time_values` to perform the\nsliding computation (here, compute a forecast) at a specific subset of reference\ntime values. 
We get out 4 new columns: `fc_target_date`, `0.05`, `0.5`, `0.95`\nthat correspond to the date the forecast is for (rather than the date it was made on), the lower endpoint of the\n90\\% prediction band, the point forecast, and the upper endpoint of the band.[^1]\n\n[^1]: If instead we had set `as_list_col = TRUE`\nin the call to `epi_slide()`, then we would have gotten a list column `fc`, \nwhere each element of `fc` contains these results.\n\nTo finish off, we plot the forecasts at some times (spaced out by a few months)\nover the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do \nso, we encapsulate the process of generating forecasts into a simple function, \nso that we can call it a few times.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-11_d30fdf1ff99b2e470d215f81656d5b01'}\n\n```{.r .cell-code}\nk_week_ahead <- function(ahead = 7) {\n epi_slide(\n x,\n ~ arx_forecaster(\n epi_data = .x,\n outcome = \"cases_7dav\",\n predictors = \"cases_7dav\",\n trainer = quantile_reg(),\n args_list = arx_args_list(ahead = ahead)\n )$predictions,\n before = 119,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n select(\n geo_value, time_value, cases_7dav, contains(\"_distn\"),\n fc_target_date\n ) %>%\n pivot_quantiles_wider(contains(\"_distn\"))\n}\n\n# First generate the forecasts, and bind them together\nz <- map(c(7, 14, 21, 28), k_week_ahead) %>% list_rbind()\n```\n:::\n\n\nThen we can plot them on top of the observed data.\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-12_f17b1e21df0fa2849ed240533f7e168f'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(z) +\n geom_line(data = x, aes(x = time_value, y = cases_7dav), color = \"gray50\") +\n geom_ribbon(aes(\n x = fc_target_date, ymin = `0.05`, ymax = `0.95`,\n group = time_value, fill = geo_value\n ), alpha = 0.4) +\n geom_line(aes(x = fc_target_date, y = `0.5`, group = time_value)) +\n geom_point(aes(x = fc_target_date, y = `0.5`, group = 
time_value), size = 0.5) +\n # geom_vline(data = tibble(x = fc_time_values), aes(xintercept = x),\n # linetype = 2, alpha = 0.5) +\n facet_wrap(vars(geo_value), scales = \"free_y\", nrow = 3) +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n scale_x_date(minor_breaks = \"1 months\", date_labels = \"%b %Y\") +\n scale_fill_viridis_d(guide = \"none\", end = .9) +\n labs(x = \"Date\", y = \"Reported COVID-19 cases\")\n```\n\n::: {.cell-output-display}\n![](slide_files/figure-html/unnamed-chunk-12-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nTwo points are worth making. First, the AR model's performance here is pretty\nspotty. At various points in time, we can see that its forecasts are volatile\n(its point predictions are all over the place), or overconfident (its bands are\ntoo narrow), or both at the same time. This is only meant as a simple demo and\nnot entirely unexpected given the way the AR model is set up. The\n[`epipredict`](https://cmu-delphi.github.io/epipredict) package, \noffers a suite of predictive modeling tools \nthat improve on many of the shortcomings of the above simple AR model (simply \nusing all states for training rather than 6 is a huge improvement).\n\nSecond, the AR forecaster here is using finalized data, meaning, it uses the\nlatest versions of signal values (reported COVID-19 cases) available, for both\ntraining models and making predictions historically. However, this is not\nreflective of the provisional nature of the data that it must cope with in a\ntrue forecast task. Training and making predictions on finalized data can lead\nto an overly optimistic sense of accuracy; see, for example, \n[@McDonaldBien2021] and references\ntherein. 
Fortunately, the `epiprocess` package provides a data structure called\n`epi_archive` that can be used to store all data revisions, and furthermore, an\n`epi_archive` object knows how to slide computations in the correct\nversion-aware sense (for the computation at each reference time $t$, it uses\nonly data that would have been available as of $t$). We will revisit this \nexample in the [archive \nvignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html).\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/sliding-forecasters/execute-results/html.json b/_freeze/sliding-forecasters/execute-results/html.json index 6e015e5..4c57753 100644 --- a/_freeze/sliding-forecasters/execute-results/html.json +++ b/_freeze/sliding-forecasters/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "8514cd1bbd08dffa71706d86999d5a77", + "hash": "fe90988fde3d6bdd6237da37e0a6583d", "result": { - "markdown": "# Pseudo-prospective forecast inspection\n\n\n::: {.cell}\n\n:::\n\n\n\nA key function from the epiprocess package is `epi_slide()`, which allows the\nuser to apply a function or formula-based computation over variables in an\n`epi_df` over a running window of `n` time steps (see the following `epiprocess`\nvignette to go over the basics of the function: [\"Slide a computation over\nsignal values\"](https://cmu-delphi.github.io/epiprocess/articles/slide.html)).\nThe equivalent sliding method for an `epi_archive` object can be called by using\nthe wrapper function `epix_slide()` (refer to the following vignette for the\nbasics of the function: [\"Work with archive objects and data\nrevisions\"](https://cmu-delphi.github.io/epiprocess/articles/archive.html)). The\nkey difference from `epi_slide()` is that it performs version-aware\ncomputations. 
That is, the function only uses data that would have been\navailable as of time t for that reference time.\n\nIn this vignette, we use `epi_slide()` and `epix_slide()` for backtesting our\n`arx_forecaster` on historical COVID-19 case data from the US and from Canada.\nMore precisely, we first demonstrate using `epi_slide()` to slide ARX\nforecasters over an `epi_df` object and compare the results obtained from using\ndifferent forecasting engines. We then compare these simple retrospective\nforecasts to more proper \"pseudoprospective\" forecasts generated using snapshots\nof the data that was available in real time, using `epix_slide()`.\n\n## Comparing different forecasting engines\n\n### Example using CLI and case data from US states \n\nFirst, we download the version history (i.e. archive) of the percentage of\ndoctor’s visits with CLI (COVID-like illness) computed from medical insurance\nclaims and the number of new confirmed COVID-19 cases per 100,000 population\n(daily) for all 50 states from the COVIDcast API. 
We process as before, with the\nmodification that we use `sync = \"locf\"` in `epix_merge()` so that the last\nversion of each observation can be carried forward to extrapolate unavailable\nversions for the less up-to-date input archive.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/grab-epi-data_d4d80a61c31b62ea1a5d61f1072177bf'}\n\n```{.r .cell-code}\nus_raw_history_dfs <-\n readRDS(system.file(\"extdata\", \"all_states_covidcast_signals.rds\",\n package = \"epipredict\", mustWork = TRUE\n ))\n\nus_cli_archive <- us_raw_history_dfs[[1]] %>%\n select(geo_value, time_value, version = issue, percent_cli = value) %>%\n as_epi_archive(compactify = TRUE)\nus_cases_archive <- us_raw_history_dfs[[2]] %>%\n select(geo_value, time_value, version = issue, case_rate = value) %>%\n as_epi_archive(compactify = TRUE)\n\nus_archive <- epix_merge(\n us_cli_archive, us_cases_archive,\n sync = \"locf\", compactify = TRUE\n)\n```\n:::\n\n\nAfter obtaining the latest snapshot of the data, we produce forecasts on that\ndata using the default engine of simple linear regression and compare to a\nrandom forest.\n\nNote that all of the warnings about the forecast date being less than the most\nrecent update date of the data have been suppressed to avoid cluttering the\noutput.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-arx-kweek_e94760e6e189c78095774ec6e5d8dd64'}\n\n```{.r .cell-code}\n# Latest snapshot of data, and forecast dates\nus_latest <- epix_as_of(us_archive, max_version = max(us_archive$versions_end))\nfc_time_values <- seq(\n from = as.Date(\"2020-08-01\"),\n to = as.Date(\"2021-11-01\"),\n by = \"1 month\"\n)\naheads <- c(7, 14, 21, 28)\n\nk_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) {\n epi_slide(epi_df, ~ arx_forecaster(\n .x, outcome, predictors, engine,\n args_list = arx_args_list(ahead = ahead)\n )$predictions %>%\n select(-geo_value),\n before = 120L - 1L,\n ref_time_values = 
fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n select(geo_value, time_value, starts_with(\"fc\")) %>%\n mutate(engine_type = engine$engine)\n}\n\n# Generate the forecasts and bind them together\nfc <- bind_rows(\n map(aheads, ~ k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"), .x,\n engine = linear_reg()\n )) %>%\n list_rbind(),\n map(aheads, ~ k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"), .x,\n engine = rand_forest(mode = \"regression\")\n )) %>%\n list_rbind()\n) %>%\n pivot_quantiles(contains(\"_distn\"))\n```\n:::\n\n\nHere, `arx_forecaster()` does all the heavy lifting. It creates leads of the\ntarget (respecting time stamps and locations) along with lags of the features\n(here, the response and doctors visits), estimates a forecasting model using the\nspecified engine, creates predictions, and non-parametric confidence bands. \n\nTo see how the predictions compare, we plot them on top of the latest case\nrates. Note that even though we've fitted the model on all states, \nwe'll just display the\nresults for two states, California (CA) and Florida (FL), to get a sense of the\nmodel performance while keeping the graphic simple. 
\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-arx_cf5ed426dadcf87aa72c873f89ba401b'}\n\n```{.r .cell-code code-fold=\"true\"}\nfc_cafl <- fc %>% filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_cafl <- us_latest %>% filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(fc_cafl, aes(fc_target_date, group = time_value, fill = engine_type)) +\n geom_line(\n data = latest_cafl, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_grid(engine_type ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_fill_brewer(palette = \"Set1\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-arx-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nFor the two states of interest, simple linear regression clearly performs better\nthan random forest in terms of accuracy of the predictions and does not\nresult in such in overconfident predictions (overly narrow confidence bands).\nThough, in general, neither approach produces amazingly accurate forecasts. \nThis could be because\nthe behaviour is rather different across states and the effects of other notable\nfactors such as age and public health measures may be important to account for\nin such forecasting. 
Including such factors as well as making enhancements such\nas correcting for outliers are some improvements one could make to this simple\nmodel.[^1]\n\n[^1]: Note that, despite the above caveats, simple models like this tend to out-perform many far more complicated models in the online Covid forecasting due to those models high variance predictions.\n\n### Example using case data from Canada\n\nBy leveraging the flexibility of `epiprocess`, we can apply the same techniques\nto data from other sources. Since some collaborators are in British Columbia,\nCanada, we'll do essentially the same thing for Canada as we did above.\n\nThe [COVID-19 Canada Open Data Working Group](https://opencovid.ca/) collects\ndaily time series data on COVID-19 cases, deaths, recoveries, testing and\nvaccinations at the health region and province levels. Data are collected from\npublicly available sources such as government datasets and news releases.\nUnfortunately, there is no simple versioned source, so we have created our own\nfrom the Github commit history.\n\nFirst, we load versioned case rates at the provincial level. After converting\nthese to 7-day averages (due to highly variable provincial reporting\nmismatches), we then convert the data to an `epi_archive` object, and extract\nthe latest version from it. 
Finally, we run the same forcasting exercise as for\nthe American data, but here we compare the forecasts produced from using simple\nlinear regression with those from using boosted regression trees.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/get-can-fc_e71fa828c8e3e33e0763c1dbbe5bc5ce'}\n\n```{.r .cell-code}\n# source(\"drafts/canada-case-rates.R)\ncan <- readRDS(system.file(\n \"extdata\", \"can_prov_cases.rds\",\n package = \"epipredict\", mustWork = TRUE\n))\ncan <- can %>%\n group_by(version, geo_value) %>%\n arrange(time_value) %>%\n mutate(cr_7dav = RcppRoll::roll_meanr(case_rate, n = 7L)) %>%\n as_epi_archive(compactify = TRUE)\n\ncan_latest <- epix_as_of(can, max_version = max(can$DT$version))\n\n# Generate the forecasts, and bind them together\ncan_fc <- bind_rows(\n map(aheads, ~ k_week_ahead(\n can_latest, \"cr_7dav\", \"cr_7dav\", .x, linear_reg()\n )) %>%\n list_rbind(),\n map(aheads, ~ k_week_ahead(\n can_latest, \"cr_7dav\", \"cr_7dav\", .x,\n boost_tree(mode = \"regression\", trees = 20)\n )) %>%\n list_rbind()\n) %>%\n pivot_quantiles(contains(\"_distn\"))\n```\n:::\n\n\nThe first figure shows the results for all of the provinces using linear regression. 
\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-lr_749e70213871f43929436d4a578868fa'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"lm\"),\n aes(x = fc_target_date, group = time_value)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest, aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(\n title = \"Using simple linear regression\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-lr-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nCompare those forecasts with a related set using Gradient Boosting.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-boost_145622420fe9517007923111890c3146'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"xgboost\"),\n aes(x = fc_target_date, group = time_value)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest, aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n 
scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(\n title = \"Using boosted regression trees\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-boost-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nBoth approaches tend to produce quite volatile forecasts (point predictions)\nand/or are overly confident (very narrow bands), particularly when boosted\nregression trees are used. But as this is meant to be a simple demonstration of\nsliding with different engines in `arx_forecaster`, we may devote another\nvignette to work on improving the predictive modelling using the suite of tools\navailable in epipredict.\n\n## Pseudoprospective vs. unfaithful retrospective forecasting\n\n### Example using case data from US states \n\nWe will now run pseudoprospective forecasts based on properly-versioned data\n(that would have been available in real-time) to forecast future COVID-19 case\nrates from current and past COVID-19 case rates for all states. That is, we can\nmake forecasts on the archive, `us_archive`, and compare those to forecasts on\n(time windows of) the latest data, `us_latest`, using the same general set-up as\nabove. For pseudoprospective forecasting, note that `us_archive` is fed into\n`epix_slide()`, while for simpler (unfaithful) retrospective forecasting,\n`us_latest` is fed into `epi_slide()`. 
#%% update to include percent_cli after\nthat issue is fixed?\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-ar-kweek-asof_21e1c68d47e5580356a43fdfb3832164'}\n\n```{.r .cell-code}\nk_week_versioning <- function(ahead, version = c(\"faithful\", \"unfaithful\")) {\n version <- match.arg(version)\n if (version == \"faithful\") {\n epix_slide(\n us_archive,\n ~ arx_forecaster(\n .x, \"case_rate\", c(\"case_rate\", \"percent_cli\"),\n args_list = arx_args_list(ahead = ahead)\n )$predictions,\n before = 120 - 1,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n mutate(version = \"version faithful\") %>%\n rename(geo_value = \"fc_geo_value\")\n } else {\n k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"),\n ahead, linear_reg()\n ) %>% mutate(version = \"not version faithful\")\n }\n}\n\n# Generate the forecasts, and bind them together\nfc <- bind_rows(\n map(aheads, ~ k_week_versioning(.x, \"faithful\")) %>% list_rbind(),\n map(aheads, ~ k_week_versioning(.x, \"unfaithful\")) %>% list_rbind()\n) %>% pivot_quantiles(fc_.pred_distn)\n```\n:::\n\n\nNow we can plot the results on top of the latest case rates. 
As before, we will only display and focus on the results for FL and CA for simplicity.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-ar-asof_c6417eaf4d97855d750b9f8aeb315d67'}\n\n```{.r .cell-code code-fold=\"true\"}\nfc_cafl <- fc %>% filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_cafl <- us_latest %>% filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(fc_cafl, aes(x = fc_target_date, group = time_value)) +\n geom_line(\n data = latest_cafl, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = version), alpha = 0.4) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_grid(version ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n scale_fill_brewer(palette = \"Set1\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-ar-asof-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAgain, we observe that the results are not great for these two states, but\nthat's likely due to the simplicity of the model (ex. the omission of key\nfactors such as age and public health measures) and the quality of the data (ex.\nwe have not personally corrected for anomalies in the data).\n\nWe shall leave it to the reader to try the above version aware and unaware\nforecasting exercise on the Canadian case rate data. 
The above code for the\nAmerican state data should be readily adaptable for this purpose.\n", + "markdown": "# Pseudo-prospective forecast inspection\n\n\n::: {.cell}\n\n:::\n\n\n\nA key function from the epiprocess package is `epi_slide()`, which allows the\nuser to apply a function or formula-based computation over variables in an\n`epi_df` over a running window of `n` time steps (see the following `epiprocess`\nvignette to go over the basics of the function: [\"Slide a computation over\nsignal values\"](https://cmu-delphi.github.io/epiprocess/articles/slide.html)).\nThe equivalent sliding method for an `epi_archive` object can be called by using\nthe wrapper function `epix_slide()` (refer to the following vignette for the\nbasics of the function: [\"Work with archive objects and data\nrevisions\"](https://cmu-delphi.github.io/epiprocess/articles/archive.html)). The\nkey difference from `epi_slide()` is that it performs version-aware\ncomputations. That is, the function only uses data that would have been\navailable as of time t for that reference time.\n\nIn this vignette, we use `epi_slide()` and `epix_slide()` for backtesting our\n`arx_forecaster` on historical COVID-19 case data from the US and from Canada.\nMore precisely, we first demonstrate using `epi_slide()` to slide ARX\nforecasters over an `epi_df` object and compare the results obtained from using\ndifferent forecasting engines. We then compare these simple retrospective\nforecasts to more proper \"pseudoprospective\" forecasts generated using snapshots\nof the data that was available in real time, using `epix_slide()`.\n\n## Comparing different forecasting engines\n\n### Example using CLI and case data from US states \n\nFirst, we download the version history (i.e. archive) of the percentage of\ndoctor’s visits with CLI (COVID-like illness) computed from medical insurance\nclaims and the number of new confirmed COVID-19 cases per 100,000 population\n(daily) for all 50 states from the COVIDcast API. 
We process as before, with the\nmodification that we use `sync = \"locf\"` in `epix_merge()` so that the last\nversion of each observation can be carried forward to extrapolate unavailable\nversions for the less up-to-date input archive.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/grab-epi-data_d4d80a61c31b62ea1a5d61f1072177bf'}\n\n```{.r .cell-code}\nus_raw_history_dfs <-\n readRDS(system.file(\"extdata\", \"all_states_covidcast_signals.rds\",\n package = \"epipredict\", mustWork = TRUE\n ))\n\nus_cli_archive <- us_raw_history_dfs[[1]] %>%\n select(geo_value, time_value, version = issue, percent_cli = value) %>%\n as_epi_archive(compactify = TRUE)\nus_cases_archive <- us_raw_history_dfs[[2]] %>%\n select(geo_value, time_value, version = issue, case_rate = value) %>%\n as_epi_archive(compactify = TRUE)\n\nus_archive <- epix_merge(\n us_cli_archive, us_cases_archive,\n sync = \"locf\", compactify = TRUE\n)\n```\n:::\n\n\nAfter obtaining the latest snapshot of the data, we produce forecasts on that\ndata using the default engine of simple linear regression and compare to a\nrandom forest.\n\nNote that all of the warnings about the forecast date being less than the most\nrecent update date of the data have been suppressed to avoid cluttering the\noutput.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-arx-kweek_501df4e2fa764c3cf5a36fe30f920bb9'}\n\n```{.r .cell-code}\n# Latest snapshot of data, and forecast dates\nus_latest <- epix_as_of(us_archive, max_version = max(us_archive$versions_end))\nfc_time_values <- seq(\n from = as.Date(\"2020-08-01\"),\n to = as.Date(\"2021-11-01\"),\n by = \"1 month\"\n)\naheads <- c(7, 14, 21, 28)\n\nk_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) {\n epi_slide(epi_df, ~ arx_forecaster(\n .x, outcome, predictors, engine,\n args_list = arx_args_list(ahead = ahead)\n )$predictions %>%\n select(-geo_value),\n before = 120L - 1L,\n ref_time_values = 
fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n select(geo_value, time_value, starts_with(\"fc\")) %>%\n mutate(engine_type = engine$engine)\n}\n\n# Generate the forecasts and bind them together\nfc <- bind_rows(\n map(aheads, ~ k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"), .x,\n engine = linear_reg()\n )) %>%\n list_rbind(),\n map(aheads, ~ k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"), .x,\n engine = rand_forest(mode = \"regression\")\n )) %>%\n list_rbind()\n) %>%\n pivot_quantiles_wider(contains(\"_distn\"))\n```\n:::\n\n\nHere, `arx_forecaster()` does all the heavy lifting. It creates leads of the\ntarget (respecting time stamps and locations) along with lags of the features\n(here, the response and doctors visits), estimates a forecasting model using the\nspecified engine, creates predictions, and non-parametric confidence bands. \n\nTo see how the predictions compare, we plot them on top of the latest case\nrates. Note that even though we've fitted the model on all states, \nwe'll just display the\nresults for two states, California (CA) and Florida (FL), to get a sense of the\nmodel performance while keeping the graphic simple. 
\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-arx_cf5ed426dadcf87aa72c873f89ba401b'}\n\n```{.r .cell-code code-fold=\"true\"}\nfc_cafl <- fc %>% filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_cafl <- us_latest %>% filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(fc_cafl, aes(fc_target_date, group = time_value, fill = engine_type)) +\n geom_line(\n data = latest_cafl, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_grid(engine_type ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_fill_brewer(palette = \"Set1\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-arx-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nFor the two states of interest, simple linear regression clearly performs better\nthan random forest in terms of accuracy of the predictions and does not\nresult in such overconfident predictions (overly narrow confidence bands).\nThough, in general, neither approach produces amazingly accurate forecasts. \nThis could be because\nthe behaviour is rather different across states and the effects of other notable\nfactors such as age and public health measures may be important to account for\nin such forecasting. 
Including such factors as well as making enhancements such\nas correcting for outliers are some improvements one could make to this simple\nmodel.[^1]\n\n[^1]: Note that, despite the above caveats, simple models like this tend to out-perform many far more complicated models in online COVID-19 forecasting due to those models' high-variance predictions.\n\n### Example using case data from Canada\n\nBy leveraging the flexibility of `epiprocess`, we can apply the same techniques\nto data from other sources. Since some collaborators are in British Columbia,\nCanada, we'll do essentially the same thing for Canada as we did above.\n\nThe [COVID-19 Canada Open Data Working Group](https://opencovid.ca/) collects\ndaily time series data on COVID-19 cases, deaths, recoveries, testing and\nvaccinations at the health region and province levels. Data are collected from\npublicly available sources such as government datasets and news releases.\nUnfortunately, there is no simple versioned source, so we have created our own\nfrom the GitHub commit history.\n\nFirst, we load versioned case rates at the provincial level. After converting\nthese to 7-day averages (due to highly variable provincial reporting\nmismatches), we then convert the data to an `epi_archive` object, and extract\nthe latest version from it. 
Finally, we run the same forecasting exercise as for\nthe American data, but here we compare the forecasts produced from using simple\nlinear regression with those from using boosted regression trees.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/get-can-fc_b65b7b3bca55827fcf187571942492de'}\n\n```{.r .cell-code}\n# source(\"drafts/canada-case-rates.R)\ncan <- readRDS(system.file(\n \"extdata\", \"can_prov_cases.rds\",\n package = \"epipredict\", mustWork = TRUE\n))\ncan <- can %>%\n group_by(version, geo_value) %>%\n arrange(time_value) %>%\n mutate(cr_7dav = RcppRoll::roll_meanr(case_rate, n = 7L)) %>%\n as_epi_archive(compactify = TRUE)\n\ncan_latest <- epix_as_of(can, max_version = max(can$DT$version))\n\n# Generate the forecasts, and bind them together\ncan_fc <- bind_rows(\n map(aheads, ~ k_week_ahead(\n can_latest, \"cr_7dav\", \"cr_7dav\", .x, linear_reg()\n )) %>%\n list_rbind(),\n map(aheads, ~ k_week_ahead(\n can_latest, \"cr_7dav\", \"cr_7dav\", .x,\n boost_tree(mode = \"regression\", trees = 20)\n )) %>%\n list_rbind()\n) %>%\n pivot_quantiles_wider(contains(\"_distn\"))\n```\n:::\n\n\nThe first figure shows the results for all of the provinces using linear regression. 
\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-lr_749e70213871f43929436d4a578868fa'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"lm\"),\n aes(x = fc_target_date, group = time_value)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest, aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(\n title = \"Using simple linear regression\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-lr-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nCompare those forecasts with a related set using Gradient Boosting.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-boost_145622420fe9517007923111890c3146'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"xgboost\"),\n aes(x = fc_target_date, group = time_value)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest, aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n 
scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(\n title = \"Using boosted regression trees\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-boost-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nBoth approaches tend to produce quite volatile forecasts (point predictions)\nand/or are overly confident (very narrow bands), particularly when boosted\nregression trees are used. But as this is meant to be a simple demonstration of\nsliding with different engines in `arx_forecaster`, we may devote another\nvignette to work on improving the predictive modelling using the suite of tools\navailable in epipredict.\n\n## Pseudoprospective vs. unfaithful retrospective forecasting\n\n### Example using case data from US states \n\nWe will now run pseudoprospective forecasts based on properly-versioned data\n(that would have been available in real-time) to forecast future COVID-19 case\nrates from current and past COVID-19 case rates for all states. That is, we can\nmake forecasts on the archive, `us_archive`, and compare those to forecasts on\n(time windows of) the latest data, `us_latest`, using the same general set-up as\nabove. For pseudoprospective forecasting, note that `us_archive` is fed into\n`epix_slide()`, while for simpler (unfaithful) retrospective forecasting,\n`us_latest` is fed into `epi_slide()`. 
#%% update to include percent_cli after\nthat issue is fixed?\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-ar-kweek-asof_5e36fd086b20659f6706eda7f36c8d40'}\n\n```{.r .cell-code}\nk_week_versioning <- function(ahead, version = c(\"faithful\", \"unfaithful\")) {\n version <- match.arg(version)\n if (version == \"faithful\") {\n epix_slide(\n us_archive,\n ~ arx_forecaster(\n .x, \"case_rate\", c(\"case_rate\", \"percent_cli\"),\n args_list = arx_args_list(ahead = ahead)\n )$predictions,\n before = 120 - 1,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n mutate(version = \"version faithful\") %>%\n rename(geo_value = \"fc_geo_value\")\n } else {\n k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"),\n ahead, linear_reg()\n ) %>% mutate(version = \"not version faithful\")\n }\n}\n\n# Generate the forecasts, and bind them together\nfc <- bind_rows(\n map(aheads, ~ k_week_versioning(.x, \"faithful\")) %>% list_rbind(),\n map(aheads, ~ k_week_versioning(.x, \"unfaithful\")) %>% list_rbind()\n) %>% pivot_quantiles_wider(fc_.pred_distn)\n```\n:::\n\n\nNow we can plot the results on top of the latest case rates. 
As before, we will only display and focus on the results for FL and CA for simplicity.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-ar-asof_c6417eaf4d97855d750b9f8aeb315d67'}\n\n```{.r .cell-code code-fold=\"true\"}\nfc_cafl <- fc %>% filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_cafl <- us_latest %>% filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(fc_cafl, aes(x = fc_target_date, group = time_value)) +\n geom_line(\n data = latest_cafl, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = version), alpha = 0.4) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_grid(version ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n scale_fill_brewer(palette = \"Set1\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-ar-asof-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAgain, we observe that the results are not great for these two states, but\nthat's likely due to the simplicity of the model (ex. the omission of key\nfactors such as age and public health measures) and the quality of the data (ex.\nwe have not personally corrected for anomalies in the data).\n\nWe shall leave it to the reader to try the above version aware and unaware\nforecasting exercise on the Canadian case rate data. 
The above code for the\nAmerican state data should be readily adaptable for this purpose.\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/tidymodels-intro/execute-results/html.json b/_freeze/tidymodels-intro/execute-results/html.json index 779760d..91cdedc 100644 --- a/_freeze/tidymodels-intro/execute-results/html.json +++ b/_freeze/tidymodels-intro/execute-results/html.json @@ -1,7 +1,7 @@ { "hash": "3cb64a626abfcc9bc16433ba4801ec08", "result": { - "markdown": "\n# Introduction to Tidymodels\n\n\n::: {.cell hash='tidymodels-intro_cache/html/unnamed-chunk-1_d402ade014dce906a00ea52c5e7637ed'}\n\n:::\n\n\nR contains a universe of packages that each have their own unique interfaces (functions and argument names) and return types. For instance, simple linear regression in R is traditionally performed using `lm()` from the stats package, but there's also the option to use `glm`, `glmnet` or other packages. Similarly for random forest - a user has the option to use `ranger`, `randomForest`, or `xgboost` amongst other options. Having such a bevy of options is great, but it also adds complications to the modelling process.\n\nIf only there was a unifying interface available to help simplify and streamline the modelling process. This is the purpose of `tidymodels`, which provides a unified interface for modeling that abides by the [tidy philosphy](https://tidyverse.tidyverse.org/articles/paper.html#:~:text=Its%20primary%20goal%20is%20to,easier%20to%20learn%20the%20next) and that fits nicely into the tidyverse. From pre-processing to model training to prediction to validation, `tidymodels` provides the necessary tools to perform many modelling tasks.\n\nIt is important to understand that the `tidymodels` packages do not aim to implement the algorithms themseves, rather they provide the interface to bring together many disparate approaches under the same roof. And as a result of this, model fitting tasks are easier to carry out. 
In the grand scheme of things, here's where `tidymodels` tends to fit into a data analysis project.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-2_9ebbde4ead853f4c75fd41c8da8aac8a'}\n::: {.cell-output-display}\n![](img/tidymodels_packages.png){fig-align='center' width=90%}\n:::\n:::\n\n\nNow, modelling can be broken down into several sub-tasks, and `tidymodels` recognizes this by providing different packages for different tasks. So `tidymodels` can be considered a metapackage - when you load `tidymodels`, several packages are in fact loaded including `rsample`, `recipes`, `parsniup` and `yardstick`. Each of these packages has their own role to play in the modelling process.\n\n- `rsample` is intended for sampling and subsetting tasks (such as splitting the data into test and train sets)\n- `recipes` allows the user to easily and neatly record the steps to take in data pre-processing\n- `parsnip` provides a common interface for model training to help standardize the interface for model fitting and output\n- `yardstick` gives access to model performance measures\n\nThe following diagram shows where each package comes into play in a general workflow for modelling using `tidymodels`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-3_3e419eaced9bcacb0667e00ae55cb6b5'}\n::: {.cell-output-display}\n![](img/tidymodels_model_substeps.png){fig-align='center' width=90%}\n:::\n:::\n\n\n## An example using the penguins dataset\n\nWe will now explore the `tidymodels` functions using the `penguins` dataset that we introduced and used in [Regression in Tidymodels](LINK%20TO%20VIGNETTE).\n\n### Load packages\n\nNote that `tidymodels` automatically loads some very useful `tidyverse` packages for us, including fan favourites like `dplyr` and `ggplot2`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-4_2264c225b8ea3011e2081f83c730b65e'}\n\n```{.r 
.cell-code}\nlibrary(tidymodels)\n```\n:::\n\n\n### Simplify dataset\n\nTo keep the focus on learning how to use `tidymodels`, we will work with a simplified version of the dataset in which we will only use the complete cases/rows in the `penguins` dataset\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-5_67af100b153dab40f26460d41ac7fb15'}\n\n```{.r .cell-code}\npenguins <- penguins %>%\n filter(complete.cases(.))\n\nhead(penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 6 × 7\n#> species island bill_length_mm bill_depth_mm flipper_length_mm\n#> \n#> 1 Adelie Torgersen 39.1 18.7 181\n#> 2 Adelie Torgersen 39.5 17.4 186\n#> 3 Adelie Torgersen 40.3 18 195\n#> 4 Adelie Torgersen 36.7 19.3 193\n#> 5 Adelie Torgersen 39.3 20.6 190\n#> 6 Adelie Torgersen 38.9 17.8 181\n#> # ℹ 2 more variables: body_mass_g , sex \n```\n:::\n:::\n\n\nand we will only use the `species`, `bill_length_mm`, `bill_depth_mm`, and `flipper_length_mm` variables.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-6_ef5ed1a8ec1402118f9763f6a2bf242e'}\n\n```{.r .cell-code}\npenguins <- penguins %>%\n select(c(species, bill_length_mm, bill_depth_mm, flipper_length_mm))\n```\n:::\n\n\n### Data sampling\n\nAfter fitting a model, make sure it is a good model. That is, don't forget to test how the model performs. For this reason, it is customary to split data into distinct training and test sets at the onset. The training data is used to fit the model and the test data is used to assess model performance.\n\nThe `initial_split()` function from the `rsample` package is what we will use to split our dataset into a training and test set. The function by default uses 3/4 of data for training and reserves the remaining 1/4 for testing. Use the `prop` argument to change the proportion used for training. 
Note that this function gives a `rsplit` object and not a data frame and the output of the object shows the number of rows used for testing, training and the grand total.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-7_9b76d434319551c35f834ca8a8a0e7b1'}\n\n```{.r .cell-code}\nset.seed(123) # For reproduciblity, as when we split the data below\npenguins_split <- initial_split(penguins, prop = 0.7)\npenguins_split\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> <233/100/333>\n```\n:::\n:::\n\n\nTo see what observations were used for training and testing, use the `training()` and `testing()` functions respectively.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-8_965d451cfc0d5363da62984b5411a2e3'}\n\n```{.r .cell-code}\npenguins_split %>%\n training() %>%\n glimpse()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Rows: 233\n#> Columns: 4\n#> $ species Gentoo, Adelie, Gentoo, Chinstrap, Adelie, Chinst…\n#> $ bill_length_mm 59.6, 34.4, 45.2, 49.0, 41.4, 51.0, 44.9, 51.1, 5…\n#> $ bill_depth_mm 17.0, 18.4, 15.8, 19.5, 18.5, 18.8, 13.8, 16.5, 1…\n#> $ flipper_length_mm 230, 184, 215, 210, 202, 203, 212, 225, 210, 211,…\n```\n:::\n:::\n\n\nNow, we'll create a data frame for each of the training and test set:\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-9_4dbd72c658687ca6c5c9dfc6962b1cbc'}\n\n```{.r .cell-code}\ntrain_data <- training(penguins_split)\ntest_data <- testing(penguins_split)\n```\n:::\n\n\n### Pre-processing\n\nThe main goal of this step is to use data transformations to make the data suitable for modeling. 
Most transformations that one required for standard data analysis tasks can be achieved by `dplyr`, or another `tidyverse` package.\n\n#### The pre-processing interface\n\nBefore training the model, a recipe can be used to carry out the pre-processing required by the model.\n\nThe `recipe()` has two main arguments: a formula (with the same format as when doing \\[LINK TO VIGNETTE\\]) and a data argument, which is usually the training set as that is the data used to create the model. Hence, we have `data = train_data` here.\n\nIn our example, suppose that our goal is to predict penguin species from bill length, bill depth and flipper length, then our recipe function would look as follows:\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-10_77ac862fee16d19160e6cf03adee22dc'}\n\n```{.r .cell-code}\nrecipe(species ~ ., data = train_data)\n```\n:::\n\n\nThe point of recipe is to be a more general purpose formula. A number of packages are not formula-based. The ever-popular `glmnet()` function is one example because it takes in matrices for the x and y variables instead of a formula. So a recipe is useful because you can use a package like `glmnet` by following the same standard formula-based recipe format and simply specify later on in the modelling stage that the you would like to use `glmnet`.\n\nNow, after saying that you are making a recipe by way of the `recipe()` function, simply specify the transformations that you want to apply to your data in the necessary steps. Each data transformation is one step and all of the available pre-processing transformations all have the prefix of `step_`. 
Now, while there are many step functions available ([here's](https://recipes.tidymodels.org/reference/index.html) a list), we will only use the following three in our example.\n\n- `step_corr()` to remove variables which have large absolute correlations with others\n\n- `step_center()` to normalize numeric data to have a mean of zero\n\n- `step_scale()` to normalize numeric data to have a standard deviation of one\n\nOne of the advantages of having these pre-processing steps is that they help to simplify concepts that are difficult or a pain to enforce in coding. For example, centering could be a nuisance to implement from scratch because we would first have to calculate statistics (variable averages) from the training data and then use them on both the training and on the test data. Note that centering should not be done on the test data, rather on the training data to avoid data leakage (contamination of the test data by using statistics from the test data). In a recipe, the the estimation of the variable means using the training data and the application of these to center new data sets is done automatically, under the hood, and so spares the coder from having to manually implement it. The situation is similar for scaling numeric data (`step_scale()`).\n\nAnother useful feature of the `tidymodels` pre-processing interface is that each step can be applied to one specified variable, a group of variables, or all variables. The `all_predictors()` and `all_outcomes()` functions are particularly convenient to help minimize the amount of typing you need to do. 
For instance, if you wanted to apply `step_center()` to only the predictor variables, simply type `step_center(all_predictors())` instead of listing out each and every predictor in the step function.\n\nNow, let's try this all out on our example.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-11_007b318e8247cae5fee6fa765ca79333'}\n\n```{.r .cell-code}\npenguins_recipe <- recipe(species ~ ., data = train_data) %>%\n step_corr(all_predictors()) %>%\n step_center(all_predictors(), -all_outcomes()) %>%\n step_scale(all_predictors(), -all_outcomes())\n```\n:::\n\n\nTo summarize, we obtained a recipe object, `penguins_recipe`, by putting the `recipe()` and step functions together on our training data that we had ready to go from sampling.\n\nNow, to get the recipe details, simply call the recipe object. The operations section details what pre-processing steps we've applied to the data. Notice that the steps shown here are in the order that they were input into the recipe and they specify the variables used in each step.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-12_a73027254138e5220218d014c0571de8'}\n\n```{.r .cell-code}\npenguins_recipe\n```\n:::\n\n\n### Model Training\n\nRecall that in R, the same type of model could be fit using several different packages, and each such package typically has it's own style of interface. Two popular packages to fit random forest models are `ranger` and `randomForest`. One way that their interfaces differ is in the parameter name for the number of trees - `ranger()` has the parameter `num.trees`, whereas in `randomForest` has parameter `ntree`. Such differences do not make it simple to run the model in the other package.\n\n`Tidymodels` created an single interface that supports the usage of both models. Moreover, this general interface supports an even wider range of functions that use perform random forest. 
The key part that points to the function and package to be used is the engine.\n\nLet's see how this works in practice. In the below example, we'll use the general `rand_forest()` function from `tidymodels`. In there, we can specify the number of trees by using the `trees` argument. Then, in `set_engine()` we specify that we want to use ranger's version of random forest. Notice this follows the model specification format introduced in the \\[Regression in Tidymodels\\] chapter.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-13_2ce2f34d915184be525e50766273bfe5'}\n\n```{.r .cell-code}\npenguins_ranger <- rand_forest(trees = 100, mode = \"classification\") %>%\n set_engine(\"ranger\")\n```\n:::\n\n\nNow, if we wanted to use a different package's version of random forest, we could easily do that by simply swapping out the engine. To try this out, let's use `randomForest` instead of `ranger`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-14_75b18395aafd9d4afdc081db7a05fb63'}\n\n```{.r .cell-code}\npenguins_rf <- rand_forest(trees = 100, mode = \"classification\") %>%\n set_engine(\"randomForest\")\n```\n:::\n\n\nFor the remainder of this tutorial, we'll stick with using `ranger` for simplify. At this stage, we're ready to pre-process and model. The first task of those two is to apply our recipe before we train and test our model, in that we must\n\n1. Process the recipe using the training set.\n\n2. Use the recipe on the training set to get the finalized predictor set.\n\n3. Use the recipe on the predictor set to get the test set.\n\nA workflow can be used to pair model and processing tasks together. When different recipes are needed for different models, this is very useful so that you don't have to keep track of separate model and recipe objects in your workspace. 
Hence, training and testing different workflows becomes easier.\n\nFor our example, we'll try tidy model's workflows package to pair our model and our recipe together.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-15_d496ee9edf143e935990b725debaaec7'}\n\n```{.r .cell-code}\npenguins_wflow <- workflow() %>%\n add_model(penguins_ranger) %>%\n add_recipe(penguins_recipe)\n\npenguins_wflow\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Workflow ═════════════════════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: rand_forest()\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 3 Recipe Steps\n#> \n#> • step_corr()\n#> • step_center()\n#> • step_scale()\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> Random Forest Model Specification (classification)\n#> \n#> Main Arguments:\n#> trees = 100\n#> \n#> Computational engine: ranger\n```\n:::\n:::\n\n\nAfter that, we're ready to fit the model to our training data. The `fit()` function is what we will use to prepare the the recipe and train the model from the finalized predictors.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-16_7c12180c3e71f078841b3a8df9ca54dc'}\n\n```{.r .cell-code}\npenguins_fit <- penguins_wflow %>% fit(data = train_data)\n```\n:::\n\n\nThe resulting object contains both the recipe and fitted model. To extract the model, use the helper function of `extract_fit_parsnip()`, and to extract the recipe object, use `extract_recipe()`. 
We extract the model object below.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-17_adf67ad4f74f01b4a2e09821534fca75'}\n\n```{.r .cell-code}\nextract_fit_parsnip(penguins_fit)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> parsnip model object\n#> \n#> Ranger result\n#> \n#> Call:\n#> ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~100, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) \n#> \n#> Type: Probability estimation \n#> Number of trees: 100 \n#> Sample size: 233 \n#> Number of independent variables: 3 \n#> Mtry: 1 \n#> Target node size: 10 \n#> Variable importance mode: none \n#> Splitrule: gini \n#> OOB prediction error (Brier s.): 0.02954337\n```\n:::\n:::\n\n\nOne important thing to notice is that that if we wanted to use the `randomForest` model instead of the `ranger` model, all we'd need to do is replace the engine in the model specification; the rest of the code remains the same. We shall leave it to the reader to try this on their own and marvel at the beauty of having such a unifying interface.\n\n### Use a trained workflow to predict\n\nUp to this point we have\n\n1. Built the model (`penguins_ranger`)\n\n2. Created a pre-processing recipe (`penguins_recipe`),\n\n3. Paired the model and recipe (`penguins_wflow`), and\n\n4. Trained our workflow using `fit()`.\n\nSo the next step is to use the trained workflow, `penguins_fit`, to predict with the test data. 
This is easily done with a call to `predict()`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-18_48e696c9d90b3c6d7c8251337e6b4991'}\n\n```{.r .cell-code}\npredict(penguins_fit, test_data)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 100 × 1\n#> .pred_class\n#> \n#> 1 Adelie \n#> 2 Adelie \n#> 3 Adelie \n#> 4 Chinstrap \n#> 5 Adelie \n#> 6 Adelie \n#> # ℹ 94 more rows\n```\n:::\n:::\n\n\nIf you wanted to obtain a probability for each predicted value, then simply set the `type = prob` in `predict()`. This will yield a tibble with one column per outcome type and the corresponding predicted probability for each value to be each type of outcome. Then, to add the predicted values as a new column on the test data, use the `bind_cols()` function from `dplyr`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-19_af9e880b31504b55f0de2461d26ce93e'}\n\n```{.r .cell-code}\npenguins_predp <- penguins_fit %>%\n predict(test_data, type = \"prob\")\n```\n:::\n\n\nTo add the predicted values as a new column on the test data, you can use the `bind_cols()` function from `dplyr`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-20_c0637410ff018df353bfded24b041a1a'}\n\n```{.r .cell-code}\nbind_cols(test_data, penguins_predp) %>%\n head() # View first six rows of output\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 6 × 7\n#> species bill_length_mm bill_depth_mm flipper_length_mm .pred_Adelie\n#> \n#> 1 Adelie 39.5 17.4 186 0.910\n#> 2 Adelie 40.3 18 195 0.960\n#> 3 Adelie 38.7 19 195 0.964\n#> 4 Adelie 46 21.5 194 0.286\n#> 5 Adelie 35.9 19.2 189 0.997\n#> 6 Adelie 38.2 18.1 185 1 \n#> # ℹ 2 more variables: .pred_Chinstrap , .pred_Gentoo \n```\n:::\n:::\n\n\nAlternatively, we can use the `augment()` function to obtain the predicted probabilities and add them to the test data in a one-liner.\n\n\n::: {.cell 
layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-21_4969b7fc4d20b43b37f5df734074efec'}\n\n```{.r .cell-code}\npenguins_aug <- augment(penguins_fit, test_data)\n\npenguins_aug\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 100 × 8\n#> species bill_length_mm bill_depth_mm flipper_length_mm .pred_class\n#> \n#> 1 Adelie 39.5 17.4 186 Adelie \n#> 2 Adelie 40.3 18 195 Adelie \n#> 3 Adelie 38.7 19 195 Adelie \n#> 4 Adelie 46 21.5 194 Chinstrap \n#> 5 Adelie 35.9 19.2 189 Adelie \n#> 6 Adelie 38.2 18.1 185 Adelie \n#> # ℹ 94 more rows\n#> # ℹ 3 more variables: .pred_Adelie , .pred_Chinstrap , …\n```\n:::\n:::\n\n\nWe can see from the first couple of rows shown that our model predicted the species correctly to be Adelie (in the `.pred_class` column) because the `.pred_Adelie` probabilities are by far the largest of the three predicted probabilities for each row. So while we can informally say that our model is doing well for predicting, how can we formally assess this? We would like to calculate a metric (well, probably more than one) to tell us how well our model predicts the species of penguins.\n\n### Model Validation\n\nThe `metrics()` function from the `yardstick` package is helps to assess model performance. As suggested by its name, it will output some metrics, and as an added bonus, these will be automatically selected for the type of model that you construct. The input for this function is a tibble that contains the actual values and the predicted values. This way we can compare how close the model estimates are to the truth. 
To serve this purpose, we can use `penguins_aug`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-22_b8252ed2ea48acc351d82910705fa3ba'}\n\n```{.r .cell-code}\npenguins_aug %>%\n metrics(truth = species, .pred_Adelie:.pred_Gentoo, estimate = .pred_class)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 4 × 3\n#> .metric .estimator .estimate\n#> \n#> 1 accuracy multiclass 0.97 \n#> 2 kap multiclass 0.954\n#> 3 mn_log_loss multiclass 0.123\n#> 4 roc_auc hand_till 0.993\n```\n:::\n:::\n\n\nLet's briefly go through the metrics that were generated. Accuracy is simply the proportion of values that are predicted correctly, while kappa is similar to accuracy, but is normalized by the accuracy that would be expected by chance (you can think of it as a measure that compares observed accuracy to expected accuracy from random chance alone). For our example, both the accuracy and kappa value estimates are extremely high (near to the upper limit of 1) and similar in value, indicating that our model performs very well for prediction on the test data. Log loss is a measure of the performance of a classification model and a perfect model has a log loss of 0, so our model performs pretty well in that respect. Finally, `roc_auc` is the area under ROC curve and we'll explain this very shortly so stay tuned (for now, just note that a value close to 1, like we have, is the goal). All in all, our model fairs very well.\n\nSince it is often not enough to rely purely on one number summaries of model performance, we'll also look to graphical, curve-based metrics. We'll walk through producing the classic ROC curve, which is computed using `roc_curve()` and `roc_auc()` from `yardstick`.\n\nTo get ourselves an ROC curve, we need to input the actual values and the columns of predicted class probabilities into `roc_curve()`. 
We finish off by piping into the `autoplot()` function.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-23_4f33465326b36ba9f605f8144e95ba64'}\n\n```{.r .cell-code}\npenguins_aug %>%\n roc_curve(truth = species, .pred_Adelie:.pred_Gentoo) %>%\n autoplot()\n```\n\n::: {.cell-output-display}\n![](tidymodels-intro_files/figure-html/unnamed-chunk-23-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nNotice that the x-axis displays 1 - specificity, which is otherwise known as the false positive rate. So on this plot, we can visualize the trade-off between the false positive (1 - specificity) and the true positive (sensitivity) rates. Since the best classification would be where all positives are correctly classified as positive (sensitivity = 1), and no negatives are incorrect classified as positive (specificity = 0), curves closer to the top left corner (and, hence, an area under the curve of about 1) is what we're hoping for.\n\nSo, we can see that the curves for each of our species are looking pretty close to perfection (save for Adelie, which still does very well). 
To estimate the area under the curves, we can use `roc_auc` (or look to the summary of our metrics above for this very value).\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-24_2f383612e8ab94d7d60b43f6df1b72f3'}\n\n```{.r .cell-code}\npenguins_aug %>%\n roc_auc(truth = species, .pred_Adelie:.pred_Gentoo)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 1 × 3\n#> .metric .estimator .estimate\n#> \n#> 1 roc_auc hand_till 0.993\n```\n:::\n:::\n\n\nAs expected, the estimated area is very close to 1, indicating near-perfect discrimination.\n\nThe `yardstick` package also offers other standard tools for model assessment like a confusion matrix, from which we can inspect the counts of correct classifications and miclassifications.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-25_976809de4dcd6ea5a1da1608bf941c16'}\n\n```{.r .cell-code}\npenguins_aug %>%\n conf_mat(truth = species, estimate = .pred_class)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Truth\n#> Prediction Adelie Chinstrap Gentoo\n#> Adelie 40 0 0\n#> Chinstrap 3 24 0\n#> Gentoo 0 0 33\n```\n:::\n:::\n\n\nWe could even combine this with `autoplot()` to get a nice heatmap visualization.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-26_96de13131862791cf00497058a2f1a03'}\n\n```{.r .cell-code}\npenguins_aug %>%\n conf_mat(truth = species, estimate = .pred_class) %>%\n autoplot(\"heatmap\")\n```\n\n::: {.cell-output-display}\n![](tidymodels-intro_files/figure-html/unnamed-chunk-26-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nThe diagonal shows the counts of correct predictions for each species, while the off-diagonal shows the counts of model misclassifications. 
As the metrics have indicated, our model performed magnificently on the test set as there was only three misclassifications of Adelie penguins as Chinstrap.\n\n## Concluding remarks\n\nIn this vignette, we introduced `tidymodels` and illustrated how to its packages work together by way of example. Since this was an elementary example, so use this as a starting point and explore what more can be done with this wonderful set of packages. And yet, however wonderful they are, you may have already noticed that there are limitations like the glaring lack of a set of post-processing tools to refine the results. We fill this gap for epidemiological modelling with [frosting](https://cmu-delphi.github.io/epipredict/reference/add_frosting.html). This will be formally introduced in a later chapter, so stay tuned!\\\n\\\n🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧\n\n## Attribution\n\n\nThis Chapter was largely adapted from [A Gentle Introduction to Tidymodels](https://rviews.rstudio.com/2019/06/19/a-gentle-intro-to-tidymodels/) as well as [Tidymodels - Getting Started](https://www.tidymodels.org/start/recipes/) and [Tidymodels](https://wec.wur.nl/dse/24-tidymodels.html). The diagrams are from [A Gentle Introduction to Tidymodels](https://rviews.rstudio.com/2019/06/19/a-gentle-intro-to-tidymodels/) and based on [R for Data Science](https://r4ds.had.co.nz/explore-intro.html).\n\n🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧\n", + "markdown": "\n# Introduction to Tidymodels\n\n\n::: {.cell}\n\n:::\n\n\nR contains a universe of packages that each have their own unique interfaces (functions and argument names) and return types. For instance, simple linear regression in R is traditionally performed using `lm()` from the stats package, but there's also the option to use `glm`, `glmnet` or other packages. Similarly for random forest - a user has the option to use `ranger`, `randomForest`, or `xgboost` amongst other options. 
Having such a bevy of options is great, but it also adds complications to the modelling process.\n\nIf only there was a unifying interface available to help simplify and streamline the modelling process. This is the purpose of `tidymodels`, which provides a unified interface for modeling that abides by the [tidy philosophy](https://tidyverse.tidyverse.org/articles/paper.html#:~:text=Its%20primary%20goal%20is%20to,easier%20to%20learn%20the%20next) and that fits nicely into the tidyverse. From pre-processing to model training to prediction to validation, `tidymodels` provides the necessary tools to perform many modelling tasks.\n\nIt is important to understand that the `tidymodels` packages do not aim to implement the algorithms themselves, rather they provide the interface to bring together many disparate approaches under the same roof. And as a result of this, model fitting tasks are easier to carry out. In the grand scheme of things, here's where `tidymodels` tends to fit into a data analysis project.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-2_9ebbde4ead853f4c75fd41c8da8aac8a'}\n::: {.cell-output-display}\n![](img/tidymodels_packages.png){fig-align='center' width=90%}\n:::\n:::\n\n\nNow, modelling can be broken down into several sub-tasks, and `tidymodels` recognizes this by providing different packages for different tasks. So `tidymodels` can be considered a metapackage - when you load `tidymodels`, several packages are in fact loaded including `rsample`, `recipes`, `parsnip` and `yardstick`. 
Each of these packages has their own role to play in the modelling process.\n\n- `rsample` is intended for sampling and subsetting tasks (such as splitting the data into test and train sets)\n- `recipes` allows the user to easily and neatly record the steps to take in data pre-processing\n- `parsnip` provides a common interface for model training to help standardize the interface for model fitting and output\n- `yardstick` gives access to model performance measures\n\nThe following diagram shows where each package comes into play in a general workflow for modelling using `tidymodels`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-3_3e419eaced9bcacb0667e00ae55cb6b5'}\n::: {.cell-output-display}\n![](img/tidymodels_model_substeps.png){fig-align='center' width=90%}\n:::\n:::\n\n\n## An example using the penguins dataset\n\nWe will now explore the `tidymodels` functions using the `penguins` dataset that we introduced and used in [Regression in Tidymodels](LINK%20TO%20VIGNETTE).\n\n### Load packages\n\nNote that `tidymodels` automatically loads some very useful `tidyverse` packages for us, including fan favourites like `dplyr` and `ggplot2`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-4_2264c225b8ea3011e2081f83c730b65e'}\n\n```{.r .cell-code}\nlibrary(tidymodels)\n```\n:::\n\n\n### Simplify dataset\n\nTo keep the focus on learning how to use `tidymodels`, we will work with a simplified version of the dataset in which we will only use the complete cases/rows in the `penguins` dataset\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-5_67af100b153dab40f26460d41ac7fb15'}\n\n```{.r .cell-code}\npenguins <- penguins %>%\n filter(complete.cases(.))\n\nhead(penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 6 × 7\n#> species island bill_length_mm bill_depth_mm flipper_length_mm\n#> \n#> 1 Adelie Torgersen 39.1 18.7 181\n#> 2 
Adelie Torgersen 39.5 17.4 186\n#> 3 Adelie Torgersen 40.3 18 195\n#> 4 Adelie Torgersen 36.7 19.3 193\n#> 5 Adelie Torgersen 39.3 20.6 190\n#> 6 Adelie Torgersen 38.9 17.8 181\n#> # ℹ 2 more variables: body_mass_g , sex \n```\n:::\n:::\n\n\nand we will only use the `species`, `bill_length_mm`, `bill_depth_mm`, and `flipper_length_mm` variables.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-6_ef5ed1a8ec1402118f9763f6a2bf242e'}\n\n```{.r .cell-code}\npenguins <- penguins %>%\n select(c(species, bill_length_mm, bill_depth_mm, flipper_length_mm))\n```\n:::\n\n\n### Data sampling\n\nAfter fitting a model, make sure it is a good model. That is, don't forget to test how the model performs. For this reason, it is customary to split data into distinct training and test sets at the onset. The training data is used to fit the model and the test data is used to assess model performance.\n\nThe `initial_split()` function from the `rsample` package is what we will use to split our dataset into a training and test set. The function by default uses 3/4 of data for training and reserves the remaining 1/4 for testing. Use the `prop` argument to change the proportion used for training. 
Note that this function gives a `rsplit` object and not a data frame and the output of the object shows the number of rows used for testing, training and the grand total.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-7_9b76d434319551c35f834ca8a8a0e7b1'}\n\n```{.r .cell-code}\nset.seed(123) # For reproduciblity, as when we split the data below\npenguins_split <- initial_split(penguins, prop = 0.7)\npenguins_split\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> <233/100/333>\n```\n:::\n:::\n\n\nTo see what observations were used for training and testing, use the `training()` and `testing()` functions respectively.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-8_965d451cfc0d5363da62984b5411a2e3'}\n\n```{.r .cell-code}\npenguins_split %>%\n training() %>%\n glimpse()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Rows: 233\n#> Columns: 4\n#> $ species Gentoo, Adelie, Gentoo, Chinstrap, Adelie, Chinst…\n#> $ bill_length_mm 59.6, 34.4, 45.2, 49.0, 41.4, 51.0, 44.9, 51.1, 5…\n#> $ bill_depth_mm 17.0, 18.4, 15.8, 19.5, 18.5, 18.8, 13.8, 16.5, 1…\n#> $ flipper_length_mm 230, 184, 215, 210, 202, 203, 212, 225, 210, 211,…\n```\n:::\n:::\n\n\nNow, we'll create a data frame for each of the training and test set:\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-9_4dbd72c658687ca6c5c9dfc6962b1cbc'}\n\n```{.r .cell-code}\ntrain_data <- training(penguins_split)\ntest_data <- testing(penguins_split)\n```\n:::\n\n\n### Pre-processing\n\nThe main goal of this step is to use data transformations to make the data suitable for modeling. 
Most transformations that are required for standard data analysis tasks can be achieved by `dplyr`, or another `tidyverse` package.\n\n#### The pre-processing interface\n\nBefore training the model, a recipe can be used to carry out the pre-processing required by the model.\n\nThe `recipe()` has two main arguments: a formula (with the same format as when doing \\[LINK TO VIGNETTE\\]) and a data argument, which is usually the training set as that is the data used to create the model. Hence, we have `data = train_data` here.\n\nIn our example, suppose that our goal is to predict penguin species from bill length, bill depth and flipper length, then our recipe function would look as follows:\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-10_77ac862fee16d19160e6cf03adee22dc'}\n\n```{.r .cell-code}\nrecipe(species ~ ., data = train_data)\n```\n:::\n\n\nThe point of recipe is to be a more general purpose formula. A number of packages are not formula-based. The ever-popular `glmnet()` function is one example because it takes in matrices for the x and y variables instead of a formula. So a recipe is useful because you can use a package like `glmnet` by following the same standard formula-based recipe format and simply specify later on in the modelling stage that you would like to use `glmnet`.\n\nNow, after saying that you are making a recipe by way of the `recipe()` function, simply specify the transformations that you want to apply to your data in the necessary steps. Each data transformation is one step and all of the available pre-processing transformations have the prefix of `step_`. 
Now, while there are many step functions available ([here's](https://recipes.tidymodels.org/reference/index.html) a list), we will only use the following three in our example.\n\n- `step_corr()` to remove variables which have large absolute correlations with others\n\n- `step_center()` to normalize numeric data to have a mean of zero\n\n- `step_scale()` to normalize numeric data to have a standard deviation of one\n\nOne of the advantages of having these pre-processing steps is that they help to simplify concepts that are difficult or a pain to enforce in coding. For example, centering could be a nuisance to implement from scratch because we would first have to calculate statistics (variable averages) from the training data and then use them on both the training and on the test data. Note that the centering statistics should be estimated from the training data only, and not from the test data, to avoid data leakage (contamination of the test data by using statistics from the test data). In a recipe, the estimation of the variable means using the training data and the application of these to center new data sets is done automatically, under the hood, and so spares the coder from having to manually implement it. The situation is similar for scaling numeric data (`step_scale()`).\n\nAnother useful feature of the `tidymodels` pre-processing interface is that each step can be applied to one specified variable, a group of variables, or all variables. The `all_predictors()` and `all_outcomes()` functions are particularly convenient to help minimize the amount of typing you need to do. 
For instance, if you wanted to apply `step_center()` to only the predictor variables, simply type `step_center(all_predictors())` instead of listing out each and every predictor in the step function.\n\nNow, let's try this all out on our example.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-11_007b318e8247cae5fee6fa765ca79333'}\n\n```{.r .cell-code}\npenguins_recipe <- recipe(species ~ ., data = train_data) %>%\n step_corr(all_predictors()) %>%\n step_center(all_predictors(), -all_outcomes()) %>%\n step_scale(all_predictors(), -all_outcomes())\n```\n:::\n\n\nTo summarize, we obtained a recipe object, `penguins_recipe`, by putting the `recipe()` and step functions together on our training data that we had ready to go from sampling.\n\nNow, to get the recipe details, simply call the recipe object. The operations section details what pre-processing steps we've applied to the data. Notice that the steps shown here are in the order that they were input into the recipe and they specify the variables used in each step.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-12_a73027254138e5220218d014c0571de8'}\n\n```{.r .cell-code}\npenguins_recipe\n```\n:::\n\n\n### Model Training\n\nRecall that in R, the same type of model could be fit using several different packages, and each such package typically has its own style of interface. Two popular packages to fit random forest models are `ranger` and `randomForest`. One way that their interfaces differ is in the parameter name for the number of trees - `ranger()` has the parameter `num.trees`, whereas `randomForest` has the parameter `ntree`. Such differences do not make it simple to run the model in the other package.\n\n`Tidymodels` created a single interface that supports the usage of both models. Moreover, this general interface supports an even wider range of functions that perform random forest. 
The key part that points to the function and package to be used is the engine.\n\nLet's see how this works in practice. In the below example, we'll use the general `rand_forest()` function from `tidymodels`. In there, we can specify the number of trees by using the `trees` argument. Then, in `set_engine()` we specify that we want to use ranger's version of random forest. Notice this follows the model specification format introduced in the \\[Regression in Tidymodels\\] chapter.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-13_2ce2f34d915184be525e50766273bfe5'}\n\n```{.r .cell-code}\npenguins_ranger <- rand_forest(trees = 100, mode = \"classification\") %>%\n set_engine(\"ranger\")\n```\n:::\n\n\nNow, if we wanted to use a different package's version of random forest, we could easily do that by simply swapping out the engine. To try this out, let's use `randomForest` instead of `ranger`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-14_75b18395aafd9d4afdc081db7a05fb63'}\n\n```{.r .cell-code}\npenguins_rf <- rand_forest(trees = 100, mode = \"classification\") %>%\n set_engine(\"randomForest\")\n```\n:::\n\n\nFor the remainder of this tutorial, we'll stick with using `ranger` for simplicity. At this stage, we're ready to pre-process and model. The first task of those two is to apply our recipe before we train and test our model, in that we must\n\n1. Process the recipe using the training set.\n\n2. Use the recipe on the training set to get the finalized predictor set.\n\n3. Use the recipe on the predictor set to get the test set.\n\nA workflow can be used to pair model and processing tasks together. When different recipes are needed for different models, this is very useful so that you don't have to keep track of separate model and recipe objects in your workspace. 
Hence, training and testing different workflows becomes easier.\n\nFor our example, we'll try tidy model's workflows package to pair our model and our recipe together.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-15_d496ee9edf143e935990b725debaaec7'}\n\n```{.r .cell-code}\npenguins_wflow <- workflow() %>%\n add_model(penguins_ranger) %>%\n add_recipe(penguins_recipe)\n\npenguins_wflow\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Workflow ═════════════════════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: rand_forest()\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 3 Recipe Steps\n#> \n#> • step_corr()\n#> • step_center()\n#> • step_scale()\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> Random Forest Model Specification (classification)\n#> \n#> Main Arguments:\n#> trees = 100\n#> \n#> Computational engine: ranger\n```\n:::\n:::\n\n\nAfter that, we're ready to fit the model to our training data. The `fit()` function is what we will use to prepare the the recipe and train the model from the finalized predictors.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-16_7c12180c3e71f078841b3a8df9ca54dc'}\n\n```{.r .cell-code}\npenguins_fit <- penguins_wflow %>% fit(data = train_data)\n```\n:::\n\n\nThe resulting object contains both the recipe and fitted model. To extract the model, use the helper function of `extract_fit_parsnip()`, and to extract the recipe object, use `extract_recipe()`. 
We extract the model object below.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-17_adf67ad4f74f01b4a2e09821534fca75'}\n\n```{.r .cell-code}\nextract_fit_parsnip(penguins_fit)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> parsnip model object\n#> \n#> Ranger result\n#> \n#> Call:\n#> ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~100, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) \n#> \n#> Type: Probability estimation \n#> Number of trees: 100 \n#> Sample size: 233 \n#> Number of independent variables: 3 \n#> Mtry: 1 \n#> Target node size: 10 \n#> Variable importance mode: none \n#> Splitrule: gini \n#> OOB prediction error (Brier s.): 0.03000721\n```\n:::\n:::\n\n\nOne important thing to notice is that if we wanted to use the `randomForest` model instead of the `ranger` model, all we'd need to do is replace the engine in the model specification; the rest of the code remains the same. We shall leave it to the reader to try this on their own and marvel at the beauty of having such a unifying interface.\n\n### Use a trained workflow to predict\n\nUp to this point we have\n\n1. Built the model (`penguins_ranger`)\n\n2. Created a pre-processing recipe (`penguins_recipe`),\n\n3. Paired the model and recipe (`penguins_wflow`), and\n\n4. Trained our workflow using `fit()`.\n\nSo the next step is to use the trained workflow, `penguins_fit`, to predict with the test data. 
This is easily done with a call to `predict()`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-18_48e696c9d90b3c6d7c8251337e6b4991'}\n\n```{.r .cell-code}\npredict(penguins_fit, test_data)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 100 × 1\n#> .pred_class\n#> \n#> 1 Adelie \n#> 2 Adelie \n#> 3 Adelie \n#> 4 Chinstrap \n#> 5 Adelie \n#> 6 Adelie \n#> # ℹ 94 more rows\n```\n:::\n:::\n\n\nIf you wanted to obtain a probability for each predicted value, then simply set the `type = prob` in `predict()`. This will yield a tibble with one column per outcome type and the corresponding predicted probability for each value to be each type of outcome. Then, to add the predicted values as a new column on the test data, use the `bind_cols()` function from `dplyr`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-19_af9e880b31504b55f0de2461d26ce93e'}\n\n```{.r .cell-code}\npenguins_predp <- penguins_fit %>%\n predict(test_data, type = \"prob\")\n```\n:::\n\n\nTo add the predicted values as a new column on the test data, you can use the `bind_cols()` function from `dplyr`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-20_c0637410ff018df353bfded24b041a1a'}\n\n```{.r .cell-code}\nbind_cols(test_data, penguins_predp) %>%\n head() # View first six rows of output\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 6 × 7\n#> species bill_length_mm bill_depth_mm flipper_length_mm .pred_Adelie\n#> \n#> 1 Adelie 39.5 17.4 186 0.895\n#> 2 Adelie 40.3 18 195 0.964\n#> 3 Adelie 38.7 19 195 0.960\n#> 4 Adelie 46 21.5 194 0.313\n#> 5 Adelie 35.9 19.2 189 0.999\n#> 6 Adelie 38.2 18.1 185 0.983\n#> # ℹ 2 more variables: .pred_Chinstrap , .pred_Gentoo \n```\n:::\n:::\n\n\nAlternatively, we can use the `augment()` function to obtain the predicted probabilities and add them to the test data in a one-liner.\n\n\n::: {.cell 
layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-21_4969b7fc4d20b43b37f5df734074efec'}\n\n```{.r .cell-code}\npenguins_aug <- augment(penguins_fit, test_data)\n\npenguins_aug\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 100 × 8\n#> species bill_length_mm bill_depth_mm flipper_length_mm .pred_class\n#> \n#> 1 Adelie 39.5 17.4 186 Adelie \n#> 2 Adelie 40.3 18 195 Adelie \n#> 3 Adelie 38.7 19 195 Adelie \n#> 4 Adelie 46 21.5 194 Chinstrap \n#> 5 Adelie 35.9 19.2 189 Adelie \n#> 6 Adelie 38.2 18.1 185 Adelie \n#> # ℹ 94 more rows\n#> # ℹ 3 more variables: .pred_Adelie , .pred_Chinstrap , …\n```\n:::\n:::\n\n\nWe can see from the first couple of rows shown that our model predicted the species correctly to be Adelie (in the `.pred_class` column) because the `.pred_Adelie` probabilities are by far the largest of the three predicted probabilities for each row. So while we can informally say that our model is doing well for predicting, how can we formally assess this? We would like to calculate a metric (well, probably more than one) to tell us how well our model predicts the species of penguins.\n\n### Model Validation\n\nThe `metrics()` function from the `yardstick` package helps to assess model performance. As suggested by its name, it will output some metrics, and as an added bonus, these will be automatically selected for the type of model that you construct. The input for this function is a tibble that contains the actual values and the predicted values. This way we can compare how close the model estimates are to the truth. 
To serve this purpose, we can use `penguins_aug`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-22_b8252ed2ea48acc351d82910705fa3ba'}\n\n```{.r .cell-code}\npenguins_aug %>%\n metrics(truth = species, .pred_Adelie:.pred_Gentoo, estimate = .pred_class)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 4 × 3\n#> .metric .estimator .estimate\n#> \n#> 1 accuracy multiclass 0.97 \n#> 2 kap multiclass 0.954\n#> 3 mn_log_loss multiclass 0.127\n#> 4 roc_auc hand_till 0.992\n```\n:::\n:::\n\n\nLet's briefly go through the metrics that were generated. Accuracy is simply the proportion of values that are predicted correctly, while kappa is similar to accuracy, but is normalized by the accuracy that would be expected by chance (you can think of it as a measure that compares observed accuracy to expected accuracy from random chance alone). For our example, both the accuracy and kappa value estimates are extremely high (near to the upper limit of 1) and similar in value, indicating that our model performs very well for prediction on the test data. Log loss is a measure of the performance of a classification model and a perfect model has a log loss of 0, so our model performs pretty well in that respect. Finally, `roc_auc` is the area under the ROC curve and we'll explain this very shortly so stay tuned (for now, just note that a value close to 1, like we have, is the goal). All in all, our model fares very well.\n\nSince it is often not enough to rely purely on one number summaries of model performance, we'll also look to graphical, curve-based metrics. We'll walk through producing the classic ROC curve, which is computed using `roc_curve()` and `roc_auc()` from `yardstick`.\n\nTo get ourselves an ROC curve, we need to input the actual values and the columns of predicted class probabilities into `roc_curve()`. 
We finish off by piping into the `autoplot()` function.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-23_4f33465326b36ba9f605f8144e95ba64'}\n\n```{.r .cell-code}\npenguins_aug %>%\n roc_curve(truth = species, .pred_Adelie:.pred_Gentoo) %>%\n autoplot()\n```\n\n::: {.cell-output-display}\n![](tidymodels-intro_files/figure-html/unnamed-chunk-23-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nNotice that the x-axis displays 1 - specificity, which is otherwise known as the false positive rate. So on this plot, we can visualize the trade-off between the false positive (1 - specificity) and the true positive (sensitivity) rates. Since the best classification would be where all positives are correctly classified as positive (sensitivity = 1), and no negatives are incorrectly classified as positive (specificity = 1), curves closer to the top left corner (and, hence, an area under the curve of about 1) are what we're hoping for.\n\nSo, we can see that the curves for each of our species are looking pretty close to perfection (save for Adelie, which still does very well). 
To estimate the area under the curves, we can use `roc_auc` (or look to the summary of our metrics above for this very value).\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-24_2f383612e8ab94d7d60b43f6df1b72f3'}\n\n```{.r .cell-code}\npenguins_aug %>%\n roc_auc(truth = species, .pred_Adelie:.pred_Gentoo)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 1 × 3\n#> .metric .estimator .estimate\n#> \n#> 1 roc_auc hand_till 0.992\n```\n:::\n:::\n\n\nAs expected, the estimated area is very close to 1, indicating near-perfect discrimination.\n\nThe `yardstick` package also offers other standard tools for model assessment like a confusion matrix, from which we can inspect the counts of correct classifications and misclassifications.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-25_976809de4dcd6ea5a1da1608bf941c16'}\n\n```{.r .cell-code}\npenguins_aug %>%\n conf_mat(truth = species, estimate = .pred_class)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Truth\n#> Prediction Adelie Chinstrap Gentoo\n#> Adelie 40 0 0\n#> Chinstrap 2 24 0\n#> Gentoo 1 0 33\n```\n:::\n:::\n\n\nWe could even combine this with `autoplot()` to get a nice heatmap visualization.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-intro_cache/html/unnamed-chunk-26_96de13131862791cf00497058a2f1a03'}\n\n```{.r .cell-code}\npenguins_aug %>%\n conf_mat(truth = species, estimate = .pred_class) %>%\n autoplot(\"heatmap\")\n```\n\n::: {.cell-output-display}\n![](tidymodels-intro_files/figure-html/unnamed-chunk-26-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nThe diagonal shows the counts of correct predictions for each species, while the off-diagonal shows the counts of model misclassifications. 
As the metrics have indicated, our model performed magnificently on the test set as there were only three misclassifications, all of Adelie penguins (two as Chinstrap and one as Gentoo).\n\n## Concluding remarks\n\nIn this vignette, we introduced `tidymodels` and illustrated how its packages work together by way of example. Since this was an elementary example, use this as a starting point and explore what more can be done with this wonderful set of packages. And yet, however wonderful they are, you may have already noticed that there are limitations like the glaring lack of a set of post-processing tools to refine the results. We fill this gap for epidemiological modelling with [frosting](https://cmu-delphi.github.io/epipredict/reference/add_frosting.html). This will be formally introduced in a later chapter, so stay tuned!\\\n\\\n🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧\n\n## Attribution\n\n\nThis Chapter was largely adapted from [A Gentle Introduction to Tidymodels](https://rviews.rstudio.com/2019/06/19/a-gentle-intro-to-tidymodels/) as well as [Tidymodels - Getting Started](https://www.tidymodels.org/start/recipes/) and [Tidymodels](https://wec.wur.nl/dse/24-tidymodels.html). 
The diagrams are from [A Gentle Introduction to Tidymodels](https://rviews.rstudio.com/2019/06/19/a-gentle-intro-to-tidymodels/) and based on [R for Data Science](https://r4ds.had.co.nz/explore-intro.html).\n\n🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/tidymodels-intro/figure-html/unnamed-chunk-23-1.svg b/_freeze/tidymodels-intro/figure-html/unnamed-chunk-23-1.svg index 6219289..d448cb4 100644 --- a/_freeze/tidymodels-intro/figure-html/unnamed-chunk-23-1.svg +++ b/_freeze/tidymodels-intro/figure-html/unnamed-chunk-23-1.svgdiff --git a/_freeze/tidymodels-intro/figure-html/unnamed-chunk-26-1.svg b/_freeze/tidymodels-intro/figure-html/unnamed-chunk-26-1.svg index 26dd930..f95571b 100644 --- a/_freeze/tidymodels-intro/figure-html/unnamed-chunk-26-1.svg +++ b/_freeze/tidymodels-intro/figure-html/unnamed-chunk-26-1.svgdiff --git a/_freeze/tidymodels-regression/execute-results/html.json b/_freeze/tidymodels-regression/execute-results/html.json index c2f8b4e..1ddec90 100644 --- a/_freeze/tidymodels-regression/execute-results/html.json +++ b/_freeze/tidymodels-regression/execute-results/html.json @@ -1,7 +1,7 @@ { "hash": "11af347186b163fee8f2e7de58f8f919", "result": { - "markdown": "# Regression in Tidymodels\n\n\n::: {.cell hash='tidymodels-regression_cache/html/unnamed-chunk-1_5f143964b13de5b28ac37a5c751a7e1f'}\n\n:::\n\n\nThis vignette is a gentle introduction into performing simple and multiple linear regression using `tidymodels`. Model fitting will be done using [parsnip](https://www.tidymodels.org/start/models/), which provides a unifying interface for model fitting and the resulting output. This means that parsnip provides a single interface with standardized argument names for each class of models so that you don't have to directly deal with the different interfaces for different functions that aim to do the same thing (like linear regression). 
See [here](https://www.tidymodels.org/find/parsnip/) for a list of models that `parsnip` currently supports.\n\n## Libraries\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-2_f1b70260de9274ceee0de0930c458b3d'}\n\n```{.r .cell-code}\nlibrary(tidymodels)\nlibrary(broom)\nlibrary(performance)\n```\n:::\n\n\n## Simple linear regression\n\nThe key steps to perform linear regression in `tidymodels` are to first specify the model type and then to specify the model form and the data to be used to construct it.\n\nTo illustrate, we shall look to `penguins` dataset from the `tidymodels`' `modeldata` package. This dataset contains measurements for 344 penguins from three islands in Palmer Archipelago, Antarctica, and includes information on their species, island home, size (flipper length, body mass, bill dimensions), and sex.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-3_473e0ea4ec6d47826afc1c168bb38198'}\n::: {.cell-output-display}\n![](img/palmer_penguin_species.png){fig-align='center' width=75%}\n:::\n:::\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-4_aa7c3f7c67794e4868c4a9be11dcebfd'}\n\n```{.r .cell-code}\n# Let's inspect the data\nhead(penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 6 × 7\n#> species island bill_length_mm bill_depth_mm flipper_length_mm\n#> \n#> 1 Adelie Torgersen 39.1 18.7 181\n#> 2 Adelie Torgersen 39.5 17.4 186\n#> 3 Adelie Torgersen 40.3 18 195\n#> 4 Adelie Torgersen NA NA NA\n#> 5 Adelie Torgersen 36.7 19.3 193\n#> 6 Adelie Torgersen 39.3 20.6 190\n#> # ℹ 2 more variables: body_mass_g , sex \n```\n:::\n:::\n\n\nOne thing you may have spotted is that there's missing data in this dataset in the fourth row. For simplicity, we will only work with the complete cases. 
This reduces the number of rows in our dataset to 333.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-5_6c5cb7769e50f22fea43f07d1fca5e94'}\n\n```{.r .cell-code}\npenguins <- penguins %>%\n filter(complete.cases(.))\n\nhead(penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 6 × 7\n#> species island bill_length_mm bill_depth_mm flipper_length_mm\n#> \n#> 1 Adelie Torgersen 39.1 18.7 181\n#> 2 Adelie Torgersen 39.5 17.4 186\n#> 3 Adelie Torgersen 40.3 18 195\n#> 4 Adelie Torgersen 36.7 19.3 193\n#> 5 Adelie Torgersen 39.3 20.6 190\n#> 6 Adelie Torgersen 38.9 17.8 181\n#> # ℹ 2 more variables: body_mass_g , sex \n```\n:::\n:::\n\n\nMuch better! We will now build a simple linear regression model to model bill length as a function of bill depth.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-6_9752f79826c0bd2c0727f0d373aa20ff'}\n::: {.cell-output-display}\n![](img/bill_length_depth.png){fig-align='center' width=60%}\n:::\n:::\n\n\nIn `parsnip`, the model specification is broken down into small functions such as `set_mode()` and `set_engine()` to make the interface more flexible and readable. The general structure is to first specify a mode (regression or classification) and then an engine to indicate what software (or implementation of the algorithm) will be used to fit the model. For our purposes, the mode is `regression` and the engine is `lm` for ordinary least squares. 
You may note that setting the mode is unnecessary for linear regression, but we include it here as it is a good practice.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-7_d894b718918a6c425b9e0fff7ad8299b'}\n\n```{.r .cell-code}\nlm_spec <- linear_reg() %>%\n set_mode(\"regression\") %>%\n set_engine(\"lm\")\n```\n:::\n\n\nThe above specification does not actually carry out the regression, rather it just states what we would like to do.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-8_dbdab6875a55b8152227310053c8cadd'}\n\n```{.r .cell-code}\nlm_spec\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Linear Regression Model Specification (regression)\n#> \n#> Computational engine: lm\n```\n:::\n:::\n\n\nOnce we have such a blueprint, we may fit a model by inputting data and a formula. Recall that in R, a formula takes the form `y ~ x` where `y` ix the response and `x` is the predictor variable. For our example, where the response of bill length and predictor of bill depth, we would write the formula as `bill_length_mm ~ bill_depth_mm`. \n\n::: {.callout-note}\nUnlike with standard R `formula()` objects, the names used this a formula must \nbe identical to the variable names in the dataset. No processing functions\nare allowed (processing is handled by the `recipe()`).\n:::\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-9_4435e7400b6dd465d61c248470b8ec32'}\n\n```{.r .cell-code}\nlm_fit <- lm_spec %>%\n fit(bill_length_mm ~ bill_depth_mm, data = penguins)\n\nlm_fit\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ bill_depth_mm, data = data)\n#> \n#> Coefficients:\n#> (Intercept) bill_depth_mm \n#> 54.8909 -0.6349\n```\n:::\n:::\n\n\nThe resulting `parsnip` object includes basic information about the fit such as the model coefficients. 
To access the underlying fit object, we could use the standard `lm_fit$fit` or with `purrr`'s `pluck()` function.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-10_c44cea2b4ccca541a75e2249a6991015'}\n\n```{.r .cell-code}\nlm_fit %>%\n pluck(\"fit\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ bill_depth_mm, data = data)\n#> \n#> Coefficients:\n#> (Intercept) bill_depth_mm \n#> 54.8909 -0.6349\n```\n:::\n:::\n\n\nTo get additional information about the fit (such as standard errors, and goodness-of-fit statistics), we can get a summary of the model fit as follows:\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-11_170fec411bf1e01a30f67a906726bc21'}\n\n```{.r .cell-code}\nlm_fit %>%\n pluck(\"fit\") %>%\n summary()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ bill_depth_mm, data = data)\n#> \n#> Residuals:\n#> Min 1Q Median 3Q Max \n#> -12.9498 -3.9530 -0.3657 3.7327 15.5025 \n#> \n#> Coefficients:\n#> Estimate Std. Error t value Pr(>|t|) \n#> (Intercept) 54.8909 2.5673 21.380 < 2e-16 ***\n#> bill_depth_mm -0.6349 0.1486 -4.273 2.53e-05 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> Residual standard error: 5.332 on 331 degrees of freedom\n#> Multiple R-squared: 0.05227,\tAdjusted R-squared: 0.04941 \n#> F-statistic: 18.26 on 1 and 331 DF, p-value: 2.528e-05\n```\n:::\n:::\n\n\nTo get a tidy summary of the model parameter estimates, simply use the tidy function from the [broom](https://broom.tidymodels.org/) package on the model fit. 
To extract model statistics, `glance()` can be used.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-12_3c6cda9ae2eea12f6c255b8bf2cd5061'}\n\n```{.r .cell-code}\ntidy(lm_fit)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 2 × 5\n#> term estimate std.error statistic p.value\n#> \n#> 1 (Intercept) 54.9 2.57 21.4 2.54e-64\n#> 2 bill_depth_mm -0.635 0.149 -4.27 2.53e- 5\n```\n:::\n\n```{.r .cell-code}\nglance(lm_fit)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 1 × 12\n#> r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC\n#> \n#> 1 0.0523 0.0494 5.33 18.3 0.0000253 1 -1029. 2064. 2075.\n#> # ℹ 3 more variables: deviance , df.residual , nobs \n```\n:::\n:::\n\n\nNow, to make predictions, we simply use `predict()` on the parnsip model object. In there, we must specify the dataset we want to predict on in the `new_data` argument. Note that this may be a different dataset than we used for fitting the model, but this input data must include all predictor variables that were used to fit the model.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-13_0370f690b7e270998042a729bdbf587f'}\n\n```{.r .cell-code}\npredict(lm_fit, new_data = penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 333 × 1\n#> .pred\n#> \n#> 1 43.0\n#> 2 43.8\n#> 3 43.5\n#> 4 42.6\n#> 5 41.8\n#> 6 43.6\n#> # ℹ 327 more rows\n```\n:::\n:::\n\n\nFor parnsip models, the predictions are always outputted in a tibble.\n\nTo specify the type of prediction made, modify `type` argument. 
If we set `type = \"conf_int\"`, we get a 95% confidence interval.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-14_ef093136394a01ed5d663b2ea837c983'}\n\n```{.r .cell-code}\npredict(lm_fit, new_data = penguins, type = \"conf_int\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 333 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 42.3 43.7\n#> 2 43.3 44.4\n#> 3 42.8 44.1\n#> 4 41.8 43.5\n#> 5 40.7 43.0\n#> 6 43.0 44.2\n#> # ℹ 327 more rows\n```\n:::\n:::\n\n\nTo evaluate model predictive performance, it is logical to compare the each of the observed and predicted values. To see these values side-by-side we simply bind the two vectors of interest.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-15_e464c86a9e8050cfa6b8e3174a99d1ee'}\n\n```{.r .cell-code}\nbind_cols(\n predict(lm_fit, new_data = penguins),\n penguins\n) %>%\n select(bill_length_mm, .pred)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 333 × 2\n#> bill_length_mm .pred\n#> \n#> 1 39.1 43.0\n#> 2 39.5 43.8\n#> 3 40.3 43.5\n#> 4 36.7 42.6\n#> 5 39.3 41.8\n#> 6 38.9 43.6\n#> # ℹ 327 more rows\n```\n:::\n:::\n\n\nA simpler way to do this is to use the nifty `augment()` function.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-16_008a078253bb995f610dd7929c7c4874'}\n\n```{.r .cell-code}\naugment(lm_fit, new_data = penguins) %>%\n select(bill_length_mm, .pred)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 333 × 2\n#> bill_length_mm .pred\n#> \n#> 1 39.1 43.0\n#> 2 39.5 43.8\n#> 3 40.3 43.5\n#> 4 36.7 42.6\n#> 5 39.3 41.8\n#> 6 38.9 43.6\n#> # ℹ 327 more rows\n```\n:::\n:::\n\n\n## Multiple linear regression\n\nThe only difference about fitting a multiple linear regression model in comparison to a simple linear regression model lies the formula. 
For multiple linear regression, the predictors are specified in the formula expression, separated by `+`. For example, if we have a response variable `y` and three predictors, `x1, x2,` and `x3`, we would write the formula as, `y ~ x1 + x2 + x3`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-17_77a1ab952443165abb55d9b9fdae419e'}\n\n```{.r .cell-code}\nlm_fit2 <- lm_spec %>% fit(\n formula = bill_length_mm ~ bill_depth_mm + flipper_length_mm + body_mass_g,\n data = penguins\n)\nlm_fit2\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ bill_depth_mm + flipper_length_mm + \n#> body_mass_g, data = data)\n#> \n#> Coefficients:\n#> (Intercept) bill_depth_mm flipper_length_mm body_mass_g \n#> -2.571e+01 6.131e-01 2.872e-01 3.472e-04\n```\n:::\n:::\n\n\nEverything else proceeds much the same as before. Such as obtaining parameter estimates\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-18_2f633c83ebc971b730b1364d8a51e402'}\n\n```{.r .cell-code}\ntidy(lm_fit2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 4 × 5\n#> term estimate std.error statistic p.value\n#> \n#> 1 (Intercept) -25.7 6.72 -3.83 1.55e- 4\n#> 2 bill_depth_mm 0.613 0.138 4.43 1.26e- 5\n#> 3 flipper_length_mm 0.287 0.0351 8.18 6.28e-15\n#> 4 body_mass_g 0.000347 0.000566 0.614 5.40e- 1\n```\n:::\n:::\n\n\nas well as predicting new values.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-19_bade4f9eb41832a1bfdb0efeae87f940'}\n\n```{.r .cell-code}\npredict(lm_fit2, new_data = penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 333 × 1\n#> .pred\n#> \n#> 1 39.0\n#> 2 39.7\n#> 3 42.5\n#> 4 42.8\n#> 5 42.8\n#> 6 38.4\n#> # ℹ 327 more rows\n```\n:::\n:::\n\n\nIf you would like to use all variables aside from your response as predictors, a shortcut 
is to use the formula form `y ~ .`\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-20_4a1fb1bf15b09cf0cb3b73cb1f3a7c72'}\n\n```{.r .cell-code}\nlm_fit3 <- lm_spec %>% fit(bill_length_mm ~ ., data = penguins)\nlm_fit3\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) speciesChinstrap speciesGentoo islandDream \n#> 15.343291 9.835502 6.117675 -0.503815 \n#> islandTorgersen bill_depth_mm flipper_length_mm body_mass_g \n#> -0.127431 0.300670 0.069257 0.001081 \n#> sexmale \n#> 2.047859\n```\n:::\n:::\n\n\n## Checking model assumptions\n\nAfter fitting a model, it is good to check whether the assumptions of linear regression are met. For this, we will use the `performance` package, in particular the `check_model()` function to produce several helpful plots we may use to check the assumptions for our first multiple linear regression model.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-21_9f0e50621ab07bb69b919f860e7a0ba9'}\n\n```{.r .cell-code}\nlm_fit2 %>%\n extract_fit_engine() %>%\n check_model()\n```\n\n::: {.cell-output-display}\n![](tidymodels-regression_files/figure-html/unnamed-chunk-21-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nNotice that on each plot it says what we should expect to see if the model assumption is met.\n\nWe shall now briefly walk you through what each plot means.\n\nThe first two plots help us to examine the linearity of the errors versus the fitted values. Ideally, we want this error to be relatively flat and horizontal. The third plot is for checking homogeneity of the variance, where we want the points to be roughly the same distance from the line as this indicates similar dispersion. 
The fourth plot helps us to see if there are high leverage points - points that have command or influence over the model fit. As a result, these can have a great effect on the model predictions. So the removal of such points or modifications to the model may be necessary to deal with them. The fifth plot helps us to discern collinearity, which is when predictors are highly correlated. Since independent variables should be independent, this can throw off simple regression models (in standard error of coefficient estimates and the estimates themselves, which would likely be sensitive to changes in the predictors that are included in the model). The last plot enables us to check the normality of residuals. If the distribution of the model error is non-normal, then that suggests a linear model may not be appropriate. For a QQ plot, we want the points to fall along a straight diagonal line.\n\nFor our example, we observe that there's a pretty high correlation between `body_mass_g` and `flipper_length_mm` (not quite in the red-zone of 10 and above, but close enough for concern). That is indicative of multicollinearity between them. Intuitively, it makes sense for the body mass and flipper length variables - we'd expect that as once increases, so should the other.\n\nWe can take a closer look at the correlation by whipping up a correlation matrix by using base R's `cor()` function. 
Since for collinearity we're only usually interested in the numerical predictors, we'll only include the four numeric variables.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-22_49987b72aff741a32f5dfc50558d4948'}\n\n```{.r .cell-code}\npenguins_corr <- penguins %>%\n select(body_mass_g, ends_with(\"_mm\")) %>%\n cor()\npenguins_corr\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> body_mass_g bill_length_mm bill_depth_mm flipper_length_mm\n#> body_mass_g 1.0000000 0.5894511 -0.4720157 0.8729789\n#> bill_length_mm 0.5894511 1.0000000 -0.2286256 0.6530956\n#> bill_depth_mm -0.4720157 -0.2286256 1.0000000 -0.5777917\n#> flipper_length_mm 0.8729789 0.6530956 -0.5777917 1.0000000\n```\n:::\n:::\n\n\nIndeed `body_mass_g` and `flipper_length_mm` are highly positively correlated. To deal with this problem, we'll re-fit the model without `body_mass_g`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-23_306a0e9cc19412e08433bd04830dd697'}\n\n```{.r .cell-code}\nlm_fit3 <- lm_spec %>% fit(\n formula = bill_length_mm ~ bill_depth_mm + flipper_length_mm,\n data = penguins\n)\nlm_fit3\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ bill_depth_mm + flipper_length_mm, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) bill_depth_mm flipper_length_mm \n#> -27.9762 0.6200 0.3052\n```\n:::\n:::\n\n\nand then check again to see whether the assumptions are met.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-24_6508d43288ce1be6d4caff814eec9b9f'}\n\n```{.r .cell-code}\nlm_fit3 %>%\n extract_fit_engine() %>%\n check_model()\n```\n\n::: {.cell-output-display}\n![](tidymodels-regression_files/figure-html/unnamed-chunk-24-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nOverall, the plots look pretty good. 
For details on how to interpret each of these plots and more details about model assumptions please see [here](https://easystats.github.io/see/articles/performance.html) and [here](https://rdrr.io/cran/performance/man/check_model.html).\n\n## Interaction terms\n\nIn general, the syntax to add an interaction term to a formula is as follows:\n\n- `x:y` denotes an interaction term between `x` and `y`.\n- `x*y` denotes the interaction between `x` and `y` as well as `x` and `y`; that is, `x + y + x*y`.\n\nIt is important to note that this syntax is not compatible with all engines. Thus, we shall explain how to bypass this issue by adding an interaction term in a recipe later on. For now, let's start simple by adding an interaction term between `species` and `bill_length_mm`, which allows for a species-specific slope.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-25_be09e39914328ca51bbe6fb2207c82b1'}\n\n```{.r .cell-code}\nlm_fit4 <- lm_spec %>% fit(\n formula = bill_length_mm ~ species * bill_depth_mm,\n data = penguins\n)\nlm_fit4\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ species * bill_depth_mm, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) speciesChinstrap \n#> 23.3668 -9.9389 \n#> speciesGentoo bill_depth_mm \n#> -6.6966 0.8425 \n#> speciesChinstrap:bill_depth_mm speciesGentoo:bill_depth_mm \n#> 1.0796 1.2178\n```\n:::\n:::\n\n\nUsing recipes, the interaction term is specified by using `step_interact()`. Then we construct a workflow object, where we add the linear regression model specification and recipe. Finally, we fit the model as we did for a `parsnip` model. 
Note that the workflow object does not need the variables that were specified in the recipe to be specified again.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-26_5d6b5266d0b373c844a38e0a080436d0'}\n\n```{.r .cell-code}\nrec_spec_interact <- recipe(\n formula = bill_length_mm ~ species + bill_depth_mm,\n data = penguins\n) %>%\n step_interact(~ species:bill_depth_mm)\n\nlm_wf_interact <- workflow() %>%\n add_model(lm_spec) %>%\n add_recipe(rec_spec_interact)\n\nlm_wf_interact %>% fit(penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Workflow [trained] ═══════════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 1 Recipe Step\n#> \n#> • step_interact()\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> \n#> Call:\n#> stats::lm(formula = ..y ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) speciesChinstrap \n#> 23.3668 -9.9389 \n#> speciesGentoo bill_depth_mm \n#> -6.6966 0.8425 \n#> speciesChinstrap_x_bill_depth_mm speciesGentoo_x_bill_depth_mm \n#> 1.0796 1.2178\n```\n:::\n:::\n\n\nNotice the variable name for the interaction term is not the same as it is in base R (which is simply of the form `x:y`). In `step_interact()`, the default separator between the variable names is `_x_`. You can change this default by specifying the `sep` argument in the function.\n\nTo read more about formula syntax, see [?formula](https://rdrr.io/r/stats/formula.html).\n\n## Non-linear transformations of the predictors\n\nSimilar to how we were able to add an interaction term using recipes, we can also perform a transformation as a pre-processing step. 
The function used for this is `step_mutate()` (which acts like `dplyr`'s `mutate`).\n\nNote that, in general, if you are specifying a recipe aim to keep as much of the pre-processing in your recipe specification as possible. This helps to ensure that the transformation will be applied to new data consistently.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-27_b7d9eb46155f4d760ade7a95ee6ba5d7'}\n\n```{.r .cell-code}\nrec_spec_pow2 <- recipe(bill_length_mm ~ bill_depth_mm, data = penguins) %>%\n step_mutate(bill_depth_mm2 = bill_depth_mm^2)\n\nlm_wf_pow2 <- workflow() %>%\n add_model(lm_spec) %>%\n add_recipe(rec_spec_pow2)\n\nlm_wf_pow2 %>% fit(penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Workflow [trained] ═══════════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 1 Recipe Step\n#> \n#> • step_mutate()\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> \n#> Call:\n#> stats::lm(formula = ..y ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) bill_depth_mm bill_depth_mm2 \n#> 95.2558 -5.4431 0.1413\n```\n:::\n:::\n\n\nThere are many transformations already built into recipes such as `step_log()`. So, for basic transformations, there's often no need to make your own transformation from scratch. 
See [here](https://recipes.tidymodels.org/reference/#section-step-functions-individual-transformations) for a comprehensive list of the transformations that are offered in recipes.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-28_53a92b75fcd1ad219c04c0fcc94425be'}\n\n```{.r .cell-code}\nrec_spec_log <- recipe(bill_length_mm ~ bill_depth_mm, data = penguins) %>%\n step_log(bill_depth_mm) # transforms the var in-place, keeps it's name\n\nlm_wf_log <- workflow() %>%\n add_model(lm_spec) %>%\n add_recipe(rec_spec_log)\n\nlm_wf_log %>% fit(penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Workflow [trained] ═══════════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 1 Recipe Step\n#> \n#> • step_log()\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> \n#> Call:\n#> stats::lm(formula = ..y ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) bill_depth_mm \n#> 74.95 -10.91\n```\n:::\n:::\n\n\n\\\n\\\n🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧\n\n## Attribution\n\nThis Chapter was largely adapted from [Chapter 3 of ISLR tidymodels labs](https://emilhvitfeldt.github.io/ISLR-tidymodels-labs/03-linear-regression.html). Checking linear regression assumptions using the performance package is based on [this article](https://easystats.github.io/performance/reference/check_model.html) and [this blog post](https://www.r-bloggers.com/2021/07/easystats-quickly-investigate-model-performance/) on investigating model performance. 
The artwork used is by [Allison Horst](https://twitter.com/allison_horst).[Allison Horst](https://twitter.com/allison_horst).\n\n🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧\n", + "markdown": "# Regression in Tidymodels\n\n\n::: {.cell}\n\n:::\n\n\nThis vignette is a gentle introduction into performing simple and multiple linear regression using `tidymodels`. Model fitting will be done using [parsnip](https://www.tidymodels.org/start/models/), which provides a unifying interface for model fitting and the resulting output. This means that parsnip provides a single interface with standardized argument names for each class of models so that you don't have to directly deal with the different interfaces for different functions that aim to do the same thing (like linear regression). See [here](https://www.tidymodels.org/find/parsnip/) for a list of models that `parsnip` currently supports.\n\n## Libraries\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-2_f1b70260de9274ceee0de0930c458b3d'}\n\n```{.r .cell-code}\nlibrary(tidymodels)\nlibrary(broom)\nlibrary(performance)\n```\n:::\n\n\n## Simple linear regression\n\nThe key steps to perform linear regression in `tidymodels` are to first specify the model type and then to specify the model form and the data to be used to construct it.\n\nTo illustrate, we shall look to `penguins` dataset from the `tidymodels`' `modeldata` package. 
This dataset contains measurements for 344 penguins from three islands in Palmer Archipelago, Antarctica, and includes information on their species, island home, size (flipper length, body mass, bill dimensions), and sex.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-3_473e0ea4ec6d47826afc1c168bb38198'}\n::: {.cell-output-display}\n![](img/palmer_penguin_species.png){fig-align='center' width=75%}\n:::\n:::\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-4_aa7c3f7c67794e4868c4a9be11dcebfd'}\n\n```{.r .cell-code}\n# Let's inspect the data\nhead(penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 6 × 7\n#> species island bill_length_mm bill_depth_mm flipper_length_mm\n#> \n#> 1 Adelie Torgersen 39.1 18.7 181\n#> 2 Adelie Torgersen 39.5 17.4 186\n#> 3 Adelie Torgersen 40.3 18 195\n#> 4 Adelie Torgersen NA NA NA\n#> 5 Adelie Torgersen 36.7 19.3 193\n#> 6 Adelie Torgersen 39.3 20.6 190\n#> # ℹ 2 more variables: body_mass_g , sex \n```\n:::\n:::\n\n\nOne thing you may have spotted is that there's missing data in this dataset in the fourth row. For simplicity, we will only work with the complete cases. This reduces the number of rows in our dataset to 333.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-5_6c5cb7769e50f22fea43f07d1fca5e94'}\n\n```{.r .cell-code}\npenguins <- penguins %>%\n filter(complete.cases(.))\n\nhead(penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 6 × 7\n#> species island bill_length_mm bill_depth_mm flipper_length_mm\n#> \n#> 1 Adelie Torgersen 39.1 18.7 181\n#> 2 Adelie Torgersen 39.5 17.4 186\n#> 3 Adelie Torgersen 40.3 18 195\n#> 4 Adelie Torgersen 36.7 19.3 193\n#> 5 Adelie Torgersen 39.3 20.6 190\n#> 6 Adelie Torgersen 38.9 17.8 181\n#> # ℹ 2 more variables: body_mass_g , sex \n```\n:::\n:::\n\n\nMuch better! 
We will now build a simple linear regression model to model bill length as a function of bill depth.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-6_9752f79826c0bd2c0727f0d373aa20ff'}\n::: {.cell-output-display}\n![](img/bill_length_depth.png){fig-align='center' width=60%}\n:::\n:::\n\n\nIn `parsnip`, the model specification is broken down into small functions such as `set_mode()` and `set_engine()` to make the interface more flexible and readable. The general structure is to first specify a mode (regression or classification) and then an engine to indicate what software (or implementation of the algorithm) will be used to fit the model. For our purposes, the mode is `regression` and the engine is `lm` for ordinary least squares. You may note that setting the mode is unnecessary for linear regression, but we include it here as it is a good practice.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-7_d894b718918a6c425b9e0fff7ad8299b'}\n\n```{.r .cell-code}\nlm_spec <- linear_reg() %>%\n set_mode(\"regression\") %>%\n set_engine(\"lm\")\n```\n:::\n\n\nThe above specification does not actually carry out the regression, rather it just states what we would like to do.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-8_dbdab6875a55b8152227310053c8cadd'}\n\n```{.r .cell-code}\nlm_spec\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Linear Regression Model Specification (regression)\n#> \n#> Computational engine: lm\n```\n:::\n:::\n\n\nOnce we have such a blueprint, we may fit a model by inputting data and a formula. Recall that in R, a formula takes the form `y ~ x` where `y` ix the response and `x` is the predictor variable. For our example, where the response of bill length and predictor of bill depth, we would write the formula as `bill_length_mm ~ bill_depth_mm`. 
\n\n::: {.callout-note}\nUnlike with standard R `formula()` objects, the names used this a formula must \nbe identical to the variable names in the dataset. No processing functions\nare allowed (processing is handled by the `recipe()`).\n:::\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-9_4435e7400b6dd465d61c248470b8ec32'}\n\n```{.r .cell-code}\nlm_fit <- lm_spec %>%\n fit(bill_length_mm ~ bill_depth_mm, data = penguins)\n\nlm_fit\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ bill_depth_mm, data = data)\n#> \n#> Coefficients:\n#> (Intercept) bill_depth_mm \n#> 54.8909 -0.6349\n```\n:::\n:::\n\n\nThe resulting `parsnip` object includes basic information about the fit such as the model coefficients. To access the underlying fit object, we could use the standard `lm_fit$fit` or with `purrr`'s `pluck()` function.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-10_c44cea2b4ccca541a75e2249a6991015'}\n\n```{.r .cell-code}\nlm_fit %>%\n pluck(\"fit\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ bill_depth_mm, data = data)\n#> \n#> Coefficients:\n#> (Intercept) bill_depth_mm \n#> 54.8909 -0.6349\n```\n:::\n:::\n\n\nTo get additional information about the fit (such as standard errors, and goodness-of-fit statistics), we can get a summary of the model fit as follows:\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-11_170fec411bf1e01a30f67a906726bc21'}\n\n```{.r .cell-code}\nlm_fit %>%\n pluck(\"fit\") %>%\n summary()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ bill_depth_mm, data = data)\n#> \n#> Residuals:\n#> Min 1Q Median 3Q Max \n#> -12.9498 -3.9530 -0.3657 3.7327 15.5025 \n#> \n#> Coefficients:\n#> Estimate Std. 
Error t value Pr(>|t|) \n#> (Intercept) 54.8909 2.5673 21.380 < 2e-16 ***\n#> bill_depth_mm -0.6349 0.1486 -4.273 2.53e-05 ***\n#> ---\n#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n#> \n#> Residual standard error: 5.332 on 331 degrees of freedom\n#> Multiple R-squared: 0.05227,\tAdjusted R-squared: 0.04941 \n#> F-statistic: 18.26 on 1 and 331 DF, p-value: 2.528e-05\n```\n:::\n:::\n\n\nTo get a tidy summary of the model parameter estimates, simply use the tidy function from the [broom](https://broom.tidymodels.org/) package on the model fit. To extract model statistics, `glance()` can be used.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-12_3c6cda9ae2eea12f6c255b8bf2cd5061'}\n\n```{.r .cell-code}\ntidy(lm_fit)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 2 × 5\n#> term estimate std.error statistic p.value\n#> \n#> 1 (Intercept) 54.9 2.57 21.4 2.54e-64\n#> 2 bill_depth_mm -0.635 0.149 -4.27 2.53e- 5\n```\n:::\n\n```{.r .cell-code}\nglance(lm_fit)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 1 × 12\n#> r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC\n#> \n#> 1 0.0523 0.0494 5.33 18.3 0.0000253 1 -1029. 2064. 2075.\n#> # ℹ 3 more variables: deviance , df.residual , nobs \n```\n:::\n:::\n\n\nNow, to make predictions, we simply use `predict()` on the parnsip model object. In there, we must specify the dataset we want to predict on in the `new_data` argument. 
Note that this may be a different dataset than we used for fitting the model, but this input data must include all predictor variables that were used to fit the model.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-13_0370f690b7e270998042a729bdbf587f'}\n\n```{.r .cell-code}\npredict(lm_fit, new_data = penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 333 × 1\n#> .pred\n#> \n#> 1 43.0\n#> 2 43.8\n#> 3 43.5\n#> 4 42.6\n#> 5 41.8\n#> 6 43.6\n#> # ℹ 327 more rows\n```\n:::\n:::\n\n\nFor parnsip models, the predictions are always outputted in a tibble.\n\nTo specify the type of prediction made, modify `type` argument. If we set `type = \"conf_int\"`, we get a 95% confidence interval.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-14_ef093136394a01ed5d663b2ea837c983'}\n\n```{.r .cell-code}\npredict(lm_fit, new_data = penguins, type = \"conf_int\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 333 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 42.3 43.7\n#> 2 43.3 44.4\n#> 3 42.8 44.1\n#> 4 41.8 43.5\n#> 5 40.7 43.0\n#> 6 43.0 44.2\n#> # ℹ 327 more rows\n```\n:::\n:::\n\n\nTo evaluate model predictive performance, it is logical to compare the each of the observed and predicted values. 
To see these values side-by-side we simply bind the two vectors of interest.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-15_e464c86a9e8050cfa6b8e3174a99d1ee'}\n\n```{.r .cell-code}\nbind_cols(\n predict(lm_fit, new_data = penguins),\n penguins\n) %>%\n select(bill_length_mm, .pred)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 333 × 2\n#> bill_length_mm .pred\n#> \n#> 1 39.1 43.0\n#> 2 39.5 43.8\n#> 3 40.3 43.5\n#> 4 36.7 42.6\n#> 5 39.3 41.8\n#> 6 38.9 43.6\n#> # ℹ 327 more rows\n```\n:::\n:::\n\n\nA simpler way to do this is to use the nifty `augment()` function.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-16_008a078253bb995f610dd7929c7c4874'}\n\n```{.r .cell-code}\naugment(lm_fit, new_data = penguins) %>%\n select(bill_length_mm, .pred)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 333 × 2\n#> bill_length_mm .pred\n#> \n#> 1 39.1 43.0\n#> 2 39.5 43.8\n#> 3 40.3 43.5\n#> 4 36.7 42.6\n#> 5 39.3 41.8\n#> 6 38.9 43.6\n#> # ℹ 327 more rows\n```\n:::\n:::\n\n\n## Multiple linear regression\n\nThe only difference in fitting a multiple linear regression model in comparison to a simple linear regression model lies in the formula. For multiple linear regression, the predictors are specified in the formula expression, separated by `+`. 
For example, if we have a response variable `y` and three predictors, `x1, x2,` and `x3`, we would write the formula as, `y ~ x1 + x2 + x3`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-17_77a1ab952443165abb55d9b9fdae419e'}\n\n```{.r .cell-code}\nlm_fit2 <- lm_spec %>% fit(\n formula = bill_length_mm ~ bill_depth_mm + flipper_length_mm + body_mass_g,\n data = penguins\n)\nlm_fit2\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ bill_depth_mm + flipper_length_mm + \n#> body_mass_g, data = data)\n#> \n#> Coefficients:\n#> (Intercept) bill_depth_mm flipper_length_mm body_mass_g \n#> -2.571e+01 6.131e-01 2.872e-01 3.472e-04\n```\n:::\n:::\n\n\nEverything else proceeds much the same as before. Such as obtaining parameter estimates\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-18_2f633c83ebc971b730b1364d8a51e402'}\n\n```{.r .cell-code}\ntidy(lm_fit2)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 4 × 5\n#> term estimate std.error statistic p.value\n#> \n#> 1 (Intercept) -25.7 6.72 -3.83 1.55e- 4\n#> 2 bill_depth_mm 0.613 0.138 4.43 1.26e- 5\n#> 3 flipper_length_mm 0.287 0.0351 8.18 6.28e-15\n#> 4 body_mass_g 0.000347 0.000566 0.614 5.40e- 1\n```\n:::\n:::\n\n\nas well as predicting new values.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-19_bade4f9eb41832a1bfdb0efeae87f940'}\n\n```{.r .cell-code}\npredict(lm_fit2, new_data = penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 333 × 1\n#> .pred\n#> \n#> 1 39.0\n#> 2 39.7\n#> 3 42.5\n#> 4 42.8\n#> 5 42.8\n#> 6 38.4\n#> # ℹ 327 more rows\n```\n:::\n:::\n\n\nIf you would like to use all variables aside from your response as predictors, a shortcut is to use the formula form `y ~ .`\n\n\n::: {.cell layout-align=\"center\" 
hash='tidymodels-regression_cache/html/unnamed-chunk-20_4a1fb1bf15b09cf0cb3b73cb1f3a7c72'}\n\n```{.r .cell-code}\nlm_fit3 <- lm_spec %>% fit(bill_length_mm ~ ., data = penguins)\nlm_fit3\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) speciesChinstrap speciesGentoo islandDream \n#> 15.343291 9.835502 6.117675 -0.503815 \n#> islandTorgersen bill_depth_mm flipper_length_mm body_mass_g \n#> -0.127431 0.300670 0.069257 0.001081 \n#> sexmale \n#> 2.047859\n```\n:::\n:::\n\n\n## Checking model assumptions\n\nAfter fitting a model, it is good to check whether the assumptions of linear regression are met. For this, we will use the `performance` package, in particular the `check_model()` function to produce several helpful plots we may use to check the assumptions for our first multiple linear regression model.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-21_9f0e50621ab07bb69b919f860e7a0ba9'}\n\n```{.r .cell-code}\nlm_fit2 %>%\n extract_fit_engine() %>%\n check_model()\n```\n\n::: {.cell-output-display}\n![](tidymodels-regression_files/figure-html/unnamed-chunk-21-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nNotice that on each plot it says what we should expect to see if the model assumption is met.\n\nWe shall now briefly walk you through what each plot means.\n\nThe first two plots help us to examine the linearity of the errors versus the fitted values. Ideally, we want this error to be relatively flat and horizontal. The third plot is for checking homogeneity of the variance, where we want the points to be roughly the same distance from the line as this indicates similar dispersion. The fourth plot helps us to see if there are high leverage points - points that have command or influence over the model fit. As a result, these can have a great effect on the model predictions. 
So the removal of such points or modifications to the model may be necessary to deal with them. The fifth plot helps us to discern collinearity, which is when predictors are highly correlated. Since independent variables should be independent, this can throw off simple regression models (in standard error of coefficient estimates and the estimates themselves, which would likely be sensitive to changes in the predictors that are included in the model). The last plot enables us to check the normality of residuals. If the distribution of the model error is non-normal, then that suggests a linear model may not be appropriate. For a QQ plot, we want the points to fall along a straight diagonal line.\n\nFor our example, we observe that there's a pretty high correlation between `body_mass_g` and `flipper_length_mm` (not quite in the red-zone of 10 and above, but close enough for concern). That is indicative of multicollinearity between them. Intuitively, it makes sense for the body mass and flipper length variables - we'd expect that as one increases, so should the other.\n\nWe can take a closer look at the correlation by whipping up a correlation matrix by using base R's `cor()` function. 
Since for collinearity we're only usually interested in the numerical predictors, we'll only include the four numeric variables.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-22_49987b72aff741a32f5dfc50558d4948'}\n\n```{.r .cell-code}\npenguins_corr <- penguins %>%\n select(body_mass_g, ends_with(\"_mm\")) %>%\n cor()\npenguins_corr\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> body_mass_g bill_length_mm bill_depth_mm flipper_length_mm\n#> body_mass_g 1.0000000 0.5894511 -0.4720157 0.8729789\n#> bill_length_mm 0.5894511 1.0000000 -0.2286256 0.6530956\n#> bill_depth_mm -0.4720157 -0.2286256 1.0000000 -0.5777917\n#> flipper_length_mm 0.8729789 0.6530956 -0.5777917 1.0000000\n```\n:::\n:::\n\n\nIndeed `body_mass_g` and `flipper_length_mm` are highly positively correlated. To deal with this problem, we'll re-fit the model without `body_mass_g`.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-23_306a0e9cc19412e08433bd04830dd697'}\n\n```{.r .cell-code}\nlm_fit3 <- lm_spec %>% fit(\n formula = bill_length_mm ~ bill_depth_mm + flipper_length_mm,\n data = penguins\n)\nlm_fit3\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ bill_depth_mm + flipper_length_mm, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) bill_depth_mm flipper_length_mm \n#> -27.9762 0.6200 0.3052\n```\n:::\n:::\n\n\nand then check again to see whether the assumptions are met.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-24_6508d43288ce1be6d4caff814eec9b9f'}\n\n```{.r .cell-code}\nlm_fit3 %>%\n extract_fit_engine() %>%\n check_model()\n```\n\n::: {.cell-output-display}\n![](tidymodels-regression_files/figure-html/unnamed-chunk-24-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nOverall, the plots look pretty good. 
For details on how to interpret each of these plots and more details about model assumptions please see [here](https://easystats.github.io/see/articles/performance.html) and [here](https://rdrr.io/cran/performance/man/check_model.html).\n\n## Interaction terms\n\nIn general, the syntax to add an interaction term to a formula is as follows:\n\n- `x:y` denotes an interaction term between `x` and `y`.\n- `x*y` denotes the interaction between `x` and `y` as well as `x` and `y`; that is, `x + y + x:y`.\n\nIt is important to note that this syntax is not compatible with all engines. Thus, we shall explain how to bypass this issue by adding an interaction term in a recipe later on. For now, let's start simple by adding an interaction term between `species` and `bill_depth_mm`, which allows for a species-specific slope.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-25_be09e39914328ca51bbe6fb2207c82b1'}\n\n```{.r .cell-code}\nlm_fit4 <- lm_spec %>% fit(\n formula = bill_length_mm ~ species * bill_depth_mm,\n data = penguins\n)\nlm_fit4\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = bill_length_mm ~ species * bill_depth_mm, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) speciesChinstrap \n#> 23.3668 -9.9389 \n#> speciesGentoo bill_depth_mm \n#> -6.6966 0.8425 \n#> speciesChinstrap:bill_depth_mm speciesGentoo:bill_depth_mm \n#> 1.0796 1.2178\n```\n:::\n:::\n\n\nUsing recipes, the interaction term is specified by using `step_interact()`. Then we construct a workflow object, where we add the linear regression model specification and recipe. Finally, we fit the model as we did for a `parsnip` model. 
Note that the workflow object does not need the variables that were specified in the recipe to be specified again.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-26_5d6b5266d0b373c844a38e0a080436d0'}\n\n```{.r .cell-code}\nrec_spec_interact <- recipe(\n formula = bill_length_mm ~ species + bill_depth_mm,\n data = penguins\n) %>%\n step_interact(~ species:bill_depth_mm)\n\nlm_wf_interact <- workflow() %>%\n add_model(lm_spec) %>%\n add_recipe(rec_spec_interact)\n\nlm_wf_interact %>% fit(penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Workflow [trained] ═══════════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 1 Recipe Step\n#> \n#> • step_interact()\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> \n#> Call:\n#> stats::lm(formula = ..y ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) speciesChinstrap \n#> 23.3668 -9.9389 \n#> speciesGentoo bill_depth_mm \n#> -6.6966 0.8425 \n#> speciesChinstrap_x_bill_depth_mm speciesGentoo_x_bill_depth_mm \n#> 1.0796 1.2178\n```\n:::\n:::\n\n\nNotice the variable name for the interaction term is not the same as it is in base R (which is simply of the form `x:y`). In `step_interact()`, the default separator between the variable names is `_x_`. You can change this default by specifying the `sep` argument in the function.\n\nTo read more about formula syntax, see [?formula](https://rdrr.io/r/stats/formula.html).\n\n## Non-linear transformations of the predictors\n\nSimilar to how we were able to add an interaction term using recipes, we can also perform a transformation as a pre-processing step. 
The function used for this is `step_mutate()` (which acts like `dplyr`'s `mutate`).\n\nNote that, in general, if you are specifying a recipe aim to keep as much of the pre-processing in your recipe specification as possible. This helps to ensure that the transformation will be applied to new data consistently.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-27_b7d9eb46155f4d760ade7a95ee6ba5d7'}\n\n```{.r .cell-code}\nrec_spec_pow2 <- recipe(bill_length_mm ~ bill_depth_mm, data = penguins) %>%\n step_mutate(bill_depth_mm2 = bill_depth_mm^2)\n\nlm_wf_pow2 <- workflow() %>%\n add_model(lm_spec) %>%\n add_recipe(rec_spec_pow2)\n\nlm_wf_pow2 %>% fit(penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Workflow [trained] ═══════════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 1 Recipe Step\n#> \n#> • step_mutate()\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> \n#> Call:\n#> stats::lm(formula = ..y ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) bill_depth_mm bill_depth_mm2 \n#> 95.2558 -5.4431 0.1413\n```\n:::\n:::\n\n\nThere are many transformations already built into recipes such as `step_log()`. So, for basic transformations, there's often no need to make your own transformation from scratch. 
See [here](https://recipes.tidymodels.org/reference/#section-step-functions-individual-transformations) for a comprehensive list of the transformations that are offered in recipes.\n\n\n::: {.cell layout-align=\"center\" hash='tidymodels-regression_cache/html/unnamed-chunk-28_53a92b75fcd1ad219c04c0fcc94425be'}\n\n```{.r .cell-code}\nrec_spec_log <- recipe(bill_length_mm ~ bill_depth_mm, data = penguins) %>%\n step_log(bill_depth_mm) # transforms the var in-place, keeps it's name\n\nlm_wf_log <- workflow() %>%\n add_model(lm_spec) %>%\n add_recipe(rec_spec_log)\n\nlm_wf_log %>% fit(penguins)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Workflow [trained] ═══════════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 1 Recipe Step\n#> \n#> • step_log()\n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> \n#> Call:\n#> stats::lm(formula = ..y ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) bill_depth_mm \n#> 74.95 -10.91\n```\n:::\n:::\n\n\n\\\n\\\n🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧\n\n## Attribution\n\nThis Chapter was largely adapted from [Chapter 3 of ISLR tidymodels labs](https://emilhvitfeldt.github.io/ISLR-tidymodels-labs/03-linear-regression.html). Checking linear regression assumptions using the performance package is based on [this article](https://easystats.github.io/performance/reference/check_model.html) and [this blog post](https://www.r-bloggers.com/2021/07/easystats-quickly-investigate-model-performance/) on investigating model performance. 
The artwork used is by [Allison Horst](https://twitter.com/allison_horst).\n\n🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧 🐧\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/tidymodels-regression/figure-html/unnamed-chunk-21-1.svg b/_freeze/tidymodels-regression/figure-html/unnamed-chunk-21-1.svg index 903b3c5..28bf5e4 100644 --- a/_freeze/tidymodels-regression/figure-html/unnamed-chunk-21-1.svg +++ b/_freeze/tidymodels-regression/figure-html/unnamed-chunk-21-1.svgdiff --git a/_freeze/tidymodels-regression/figure-html/unnamed-chunk-24-1.svg b/_freeze/tidymodels-regression/figure-html/unnamed-chunk-24-1.svg index 1e852ef..b8a31c9 100644 --- a/_freeze/tidymodels-regression/figure-html/unnamed-chunk-24-1.svg +++ b/_freeze/tidymodels-regression/figure-html/unnamed-chunk-24-1.svgdiff --git a/archive.qmd b/archive.qmd index 3f942c2..d873f2c 100644 --- a/archive.qmd +++ b/archive.qmd @@ -211,8 +211,8 @@ When merging archives, unless the archives have identical data release patterns, ```{r, message = FALSE, warning = FALSE,eval=FALSE} # This code is for illustration and doesn't run. # The result is saved/loaded in the (hidden) next chunk from `{epidatasets}` -y <- covidcast( - data_source = "jhu-csse", +y <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", time_type = "day", geo_type = "state", @@ -220,7 +220,6 @@ y <- covidcast( geo_values = "ca,fl,ny,tx", issues = epirange(20200601, 20211201) ) %>% - fetch() %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% as_epi_archive(compactify = TRUE) @@ -244,4 +243,4 @@ pointer aliasing, and pointer reseating is possible. ::: {.callout-note} TODO: need a simple example here. 
-::: \ No newline at end of file +::: diff --git a/epidf.qmd b/epidf.qmd index eac23c8..73d17bc 100644 --- a/epidf.qmd +++ b/epidf.qmd @@ -31,14 +31,14 @@ library(epidatr) library(epiprocess) library(withr) -cases <- covidcast( - data_source = "jhu-csse", +cases <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_cumulative_num", time_type = "day", geo_type = "state", time_values = epirange(20200301, 20220131), geo_values = "ca,fl,ny,tx" -) %>% fetch() +) colnames(cases) ``` diff --git a/epipredict.qmd b/epipredict.qmd index 6c6b9b5..3e3ac58 100644 --- a/epipredict.qmd +++ b/epipredict.qmd @@ -164,7 +164,7 @@ Another property of the basic model is the predictive interval. We describe this ```{r differential-levels} out_q <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), args_list = arx_args_list( - levels = c(.01, .025, seq(.05, .95, by = .05), .975, .99)) + quantile_levels = c(.01, .025, seq(.05, .95, by = .05), .975, .99)) ) ``` @@ -188,7 +188,7 @@ Additional simple adjustments to the basic forecaster can be made using the func ```{r, eval = FALSE} arx_args_list( lags = c(0L, 7L, 14L), ahead = 7L, n_training = Inf, - forecast_date = NULL, target_date = NULL, levels = c(0.05, 0.95), + forecast_date = NULL, target_date = NULL, quantile_levels = c(0.05, 0.95), symmetrize = TRUE, nonneg = TRUE, quantile_by_key = "geo_value" ) ``` diff --git a/flatline-forecaster.qmd b/flatline-forecaster.qmd index 4b76353..59bc0ff 100644 --- a/flatline-forecaster.qmd +++ b/flatline-forecaster.qmd @@ -55,15 +55,15 @@ five_days_ahead <- flatline_forecaster( five_days_ahead ``` -We could also specify that we want a 80% predictive interval by changing the -levels. The default 0.05 and 0.95 levels/quantiles give us 90% predictive -interval. +We could also specify that we want a 80% predictive interval by changing the +quantile levels. The default 0.05 and 0.95 levels/quantiles give us 90% +predictive intervals. 
```{r} five_days_ahead <- flatline_forecaster( jhu, outcome = "death_rate", - flatline_args_list(ahead = 5L, levels = c(0.1, 0.9)) + flatline_args_list(ahead = 5L, quantile_levels = c(0.1, 0.9)) ) five_days_ahead @@ -72,9 +72,15 @@ five_days_ahead To see the other arguments that you may modify, please see `?flatline_args_list()`. For now, we will move on to looking at the workflow. ```{r} +#| results: false five_days_ahead$epi_workflow ``` +```{r} +#| echo: false +with_messages_cat_to_stdout(print(five_days_ahead$epi_workflow)) +``` + The fitted model here was based on minimal pre-processing of the data, estimating a flatline model, and then post-processing the results to be meaningful for epidemiological tasks. To look deeper into the pre-processing, @@ -88,10 +94,7 @@ extract_preprocessor(five_days_ahead$epi_workflow) ```{r} #| echo: false -#| results: asis -#| message: true -#| collapse: true -extract_preprocessor(five_days_ahead$epi_workflow) +with_messages_cat_to_stdout(print(extract_preprocessor(five_days_ahead$epi_workflow))) ``` @@ -110,10 +113,7 @@ extract_frosting(five_days_ahead$epi_workflow) ```{r} #| echo: false -#| collapse: true -#| results: false -#| message: true -extract_frosting(five_days_ahead$epi_workflow) +with_messages_cat_to_stdout(print(extract_frosting(five_days_ahead$epi_workflow))) ``` @@ -139,9 +139,7 @@ hist <- jhu %>% preds <- five_days_ahead$predictions %>% filter(geo_value %in% samp_geos) %>% - mutate(q = nested_quantiles(.pred_distn)) %>% - unnest(q) %>% - pivot_wider(names_from = tau, values_from = q) + pivot_quantiles_wider(.pred_distn) ggplot(hist, aes(color = geo_value)) + geom_line(aes(time_value, death_rate)) + @@ -175,9 +173,7 @@ Then, we proceed as we did before. 
The only difference from before is that we're #| code-fold: true preds <- out_df %>% filter(geo_value %in% samp_geos) %>% - mutate(q = nested_quantiles(.pred_distn)) %>% - unnest(q) %>% - pivot_wider(names_from = tau, values_from = q) + pivot_quantiles_wider(.pred_distn) ggplot(hist) + geom_line(aes(time_value, death_rate)) + diff --git a/forecast-framework.qmd b/forecast-framework.qmd index 519d31d..ddffd38 100644 --- a/forecast-framework.qmd +++ b/forecast-framework.qmd @@ -66,10 +66,8 @@ extract_recipe(out_gb$epi_workflow) ``` ```{r} -#| message: true #| echo: false -#| collapse: true -extract_recipe(out_gb$epi_workflow) +with_messages_cat_to_stdout(print(extract_recipe(out_gb$epi_workflow))) ``` @@ -144,10 +142,7 @@ extract_frosting(out_gb$epi_workflow) ```{r} #| echo: false -#| results: asis -#| message: true -#| collapse: true -extract_frosting(out_gb$epi_workflow) +with_messages_cat_to_stdout(print(extract_frosting(out_gb$epi_workflow))) ``` @@ -162,7 +157,7 @@ intervals at 0. The code to do this (inside the forecaster) is f <- frosting() %>% layer_predict() %>% layer_residual_quantiles( - probs = c(.01, .025, seq(.05, .95, by = .05), .975, .99), + quantile_levels = c(.01, .025, seq(.05, .95, by = .05), .975, .99), symmetrize = TRUE) %>% layer_add_forecast_date() %>% layer_add_target_date() %>% diff --git a/index.qmd b/index.qmd index d3aea39..3ad222a 100644 --- a/index.qmd +++ b/index.qmd @@ -211,9 +211,7 @@ hist <- jhu %>% time_value >= max(time_value) - 90L) preds <- two_week_ahead$predictions %>% filter(geo_value %in% samp_geos) %>% - mutate(q = nested_quantiles(.pred_distn)) %>% - unnest(q) %>% - pivot_wider(names_from = tau, values_from = q) + pivot_quantiles_wider(.pred_distn) ggplot(hist, aes(color = geo_value)) + geom_line(aes(time_value, death_rate)) + diff --git a/packages.bib b/packages.bib index 7721d79..b4c56f8 100644 --- a/packages.bib +++ b/packages.bib @@ -25,15 +25,14 @@ @Manual{R-epidatasets @Manual{R-epidatr, title = {epidatr: Client for 
Delphi's Epidata API}, - author = {Jacob Bien and Logan Brooks and David Farrow and Pedrito Maynard-Zhang and Alex Reinhart and Ryan Tibshirani and Samuel Gratzl}, - note = {R package version 0.6.0}, - url = {https://github.com/cmu-delphi/epidatr}, + author = {Logan Brooks and Dmitry Shemetov and Samuel Gratzl}, year = {2023}, + note = {https://cmu-delphi.github.io/epidatr/, https://cmu-delphi.github.io/delphi-epidata/}, } @Manual{R-epipredict, title = {epipredict: Basic epidemiology forecasting methods}, - author = {Daniel McDonald and Ryan Tibshirani and Logan Brooks and Rachel Lobay and Maggie Liu and Ken Mawer and Chloe You}, + author = {Daniel McDonald and Ryan Tibshirani and Logan Brooks and Rachel Lobay}, note = {https://github.com/cmu-delphi/epipredict/}, year = {2023}, } @@ -41,7 +40,7 @@ @Manual{R-epipredict @Manual{R-epiprocess, title = {epiprocess: Tools for basic signal processing in epidemiology}, author = {Logan Brooks and Daniel McDonald and Evan Ray and Ryan Tibshirani}, - note = {R package version 0.6.0.9999}, + note = {R package version 0.7.0.9999}, url = {https://cmu-delphi.github.io/epiprocess/}, year = {2023}, } @@ -124,6 +123,12 @@ @Manual{R-tidyverse year = {2023}, note = {R package version 2.0.0}, url = {https://CRAN.R-project.org/package=tidyverse}, + +@Misc{epidatr2015, + title = {Delphi Epidata API}, + author = {David C. Farrow and Logan C. Brooks and Aaron Rumack and Ryan J. Tibshirani and Roni Rosenfeld}, + year = {2015}, + url = {https://github.com/cmu-delphi/delphi-epidata}, } @Book{ggplot22016, diff --git a/preprocessing-and-models.qmd b/preprocessing-and-models.qmd index bb161d4..812421c 100644 --- a/preprocessing-and-models.qmd +++ b/preprocessing-and-models.qmd @@ -63,24 +63,22 @@ package. 
#| eval: false #| code-fold: true geos <- c("ca", "fl", "tx", "ny", "nj") -x <- covidcast( - data_source = "jhu-csse", +x <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_incidence_num", time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), geo_values = geos) %>% - fetch() %>% select(geo_value, time_value, cases = value) -y <- covidcast( - data_source = "jhu-csse", +y <- pub_covidcast( + source = "jhu-csse", signals = "deaths_incidence_num", time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), geo_values = geos) %>% - fetch() %>% select(geo_value, time_value, deaths = value) counts_subset <- full_join(x, y, by = c("geo_value", "time_value")) %>% @@ -232,24 +230,22 @@ in public in the past 7 days maintained a distance of at least 6 feet. #| eval: false #| code-fold: true # Download the raw data as used in {epidatasets} -behav_ind_mask <- covidcast( - data_source = "fb-survey", +behav_ind_mask <- pub_covidcast( + source = "fb-survey", signals = "smoothed_wwearing_mask_7d", time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), geo_values = geos) %>% - fetch() %>% select(geo_value, time_value, masking = value) -behav_ind_distancing <- covidcast( - data_source = "fb-survey", +behav_ind_distancing <- pub_covidcast( + source = "fb-survey", signals = "smoothed_wothers_distanced_public", time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), geo_values = geos) %>% - fetch() %>% select(geo_value, time_value, distancing = value) ctis_covid_behaviours <- behav_ind_mask %>% @@ -349,7 +345,7 @@ f <- frosting() %>% by = c("geo_value" = "abbr"), df_pop_col = "pop") -wf <- epi_workflow(r, quantile_reg(tau = c(.05, .5, .95))) %>% +wf <- epi_workflow(r, quantile_reg(quantile_levels = c(.05, .5, .95))) %>% fit(jhu) %>% add_frosting(f) @@ -367,7 +363,7 @@ To look at the prediction intervals: ```{r} p %>% select(geo_value, target_date, .pred_scaled, 
.pred_distn_scaled) %>% - pivot_quantiles(.pred_distn_scaled) + pivot_quantiles_wider(.pred_distn_scaled) ``` diff --git a/renv.lock b/renv.lock index a29ca34..82137ae 100644 --- a/renv.lock +++ b/renv.lock @@ -306,7 +306,13 @@ "Package": "bit", "Version": "4.0.5", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "bit", + "RemoteRef": "bit", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "4.0.5", "Requirements": [ "R" ], @@ -330,7 +336,13 @@ "Package": "blob", "Version": "1.2.4", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "blob", + "RemoteRef": "blob", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.2.4", "Requirements": [ "methods", "rlang", @@ -394,7 +406,13 @@ "Package": "callr", "Version": "3.7.3", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "callr", + "RemoteRef": "callr", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "3.7.3", "Requirements": [ "R", "R6", @@ -541,7 +559,13 @@ "Package": "crayon", "Version": "1.5.2", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "crayon", + "RemoteRef": "crayon", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.5.2", "Requirements": [ "grDevices", "methods", @@ -601,7 +625,13 @@ "Package": "dbplyr", "Version": "2.3.2", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "dbplyr", + "RemoteRef": "dbplyr", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + 
"RemoteSha": "2.3.2", "Requirements": [ "DBI", "R", @@ -643,7 +673,13 @@ "Package": "diagram", "Version": "1.6.5", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "diagram", + "RemoteRef": "diagram", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.6.5", "Requirements": [ "R", "graphics", @@ -734,7 +770,13 @@ "Package": "dtplyr", "Version": "1.3.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "dtplyr", + "RemoteRef": "dtplyr", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.3.1", "Requirements": [ "R", "cli", @@ -795,40 +837,44 @@ }, "epidatr": { "Package": "epidatr", - "Version": "0.6.0", + "Version": "1.0.0.9000", "Source": "GitHub", "RemoteType": "github", "RemoteHost": "api.github.com", "RemoteUsername": "cmu-delphi", "RemoteRepo": "epidatr", "RemoteRef": "dev", - "RemoteSha": "46d2d54cef8f4d1fcd25a67c74d3d0cafce95f59", + "RemoteSha": "6e9f8996dfbb4a27cff7eebe66ccb04df91caa1a", "Requirements": [ "MMWRweek", "R", + "cachem", "checkmate", "cli", + "glue", "httr", "jsonlite", "magrittr", + "openssl", + "purrr", "readr", - "rlang", "tibble", + "usethis", "xml2" ], - "Hash": "5191033dee038243674ad98615672138" + "Hash": "ccf545a4147535c885aec5280369bbd5" }, "epipredict": { "Package": "epipredict", - "Version": "0.0.5", + "Version": "0.0.6", "Source": "GitHub", "RemoteType": "github", "RemoteHost": "api.github.com", "RemoteUsername": "cmu-delphi", "RemoteRepo": "epipredict", "RemoteRef": "main", - "RemoteSha": "206f0ef8faf00a882aaf0d0750fd72c2cb752c08", - "Remotes": "cmu-delphi/epidatr, cmu-delphi/epiprocess@dev, dajmcdon/smoothqr", + "RemoteSha": "378577a213aa59043dea73e8fdea725432f32f16", + "Remotes": "cmu-delphi/epidatr, cmu-delphi/epiprocess, dajmcdon/smoothqr", "Requirements": [ "R", 
"cli", @@ -839,6 +885,7 @@ "generics", "glue", "hardhat", + "lifecycle", "magrittr", "methods", "parsnip", @@ -854,18 +901,18 @@ "vctrs", "workflows" ], - "Hash": "4a8fbcf019dc804a18e677c9d9dca925" + "Hash": "6510b50d6231ea104960ae17175a3205" }, "epiprocess": { "Package": "epiprocess", - "Version": "0.6.0.9999", + "Version": "0.7.0.9999", "Source": "GitHub", "RemoteType": "github", "RemoteHost": "api.github.com", "RemoteUsername": "cmu-delphi", "RemoteRepo": "epiprocess", "RemoteRef": "dev", - "RemoteSha": "572f6e63b865df4a1c439e127257168f9fa15a96", + "RemoteSha": "b444a3cd718034ebdb10e95d1d7f6c00cac0b1d9", "Remotes": "cmu-delphi/epidatr, reconverse/outbreaks, glmgen/genlasso", "Requirements": [ "R", @@ -890,7 +937,7 @@ "utils", "vctrs" ], - "Hash": "fe93046d168663fc1c68820f31b653ed" + "Hash": "57dcbaf9ee409ffea0dbc61077915c79" }, "evaluate": { "Package": "evaluate", @@ -995,7 +1042,13 @@ "Package": "forcats", "Version": "1.0.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "forcats", + "RemoteRef": "forcats", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.0.0", "Requirements": [ "R", "cli", @@ -1035,7 +1088,13 @@ "Package": "furrr", "Version": "0.3.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "furrr", + "RemoteRef": "furrr", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "0.3.1", "Requirements": [ "R", "future", @@ -1051,7 +1110,13 @@ "Package": "future", "Version": "1.32.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "future", + "RemoteRef": "future", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.32.0", "Requirements": [ "digest", 
"globals", @@ -1066,7 +1131,13 @@ "Package": "future.apply", "Version": "1.11.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "future.apply", + "RemoteRef": "future.apply", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.11.0", "Requirements": [ "R", "future", @@ -1271,7 +1342,13 @@ "Package": "gower", "Version": "1.0.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "gower", + "RemoteRef": "gower", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.0.1", "Hash": "7a0051eef852c301b5efe2f7913dd45f" }, "gtable": { @@ -1308,7 +1385,13 @@ "Package": "haven", "Version": "2.5.2", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "haven", + "RemoteRef": "haven", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "2.5.2", "Requirements": [ "R", "cli", @@ -1340,7 +1423,13 @@ "Package": "hms", "Version": "1.1.3", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "hms", + "RemoteRef": "hms", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.1.3", "Requirements": [ "lifecycle", "methods", @@ -1480,7 +1569,13 @@ "Package": "ipred", "Version": "0.9-14", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "ipred", + "RemoteRef": "ipred", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "0.9-14", "Requirements": [ "MASS", "R", @@ -1591,7 +1686,13 @@ "Package": "lava", "Version": "1.7.2.1", "Source": "Repository", - 
"Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "lava", + "RemoteRef": "lava", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.7.2.1", "Requirements": [ "R", "SQUAREM", @@ -1611,7 +1712,13 @@ "Package": "lhs", "Version": "1.1.6", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "lhs", + "RemoteRef": "lhs", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.1.6", "Requirements": [ "R", "Rcpp" @@ -1635,7 +1742,13 @@ "Package": "listenv", "Version": "0.9.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "listenv", + "RemoteRef": "listenv", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "0.9.0", "Requirements": [ "R" ], @@ -1753,7 +1866,13 @@ "Package": "modelr", "Version": "0.1.11", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "modelr", + "RemoteRef": "modelr", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "0.1.11", "Requirements": [ "R", "broom", @@ -1850,7 +1969,13 @@ "Package": "parallelly", "Version": "1.36.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "parallelly", + "RemoteRef": "parallelly", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.36.0", "Requirements": [ "parallel", "tools", @@ -1908,7 +2033,13 @@ "Package": "patchwork", "Version": "1.1.2", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "patchwork", + "RemoteRef": 
"patchwork", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.1.2", "Requirements": [ "ggplot2", "grDevices", @@ -2006,7 +2137,13 @@ "Package": "prodlim", "Version": "2023.03.31", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "prodlim", + "RemoteRef": "prodlim", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "2023.03.31", "Requirements": [ "KernSmooth", "R", @@ -2111,7 +2248,13 @@ "Package": "ragg", "Version": "1.2.5", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "ragg", + "RemoteRef": "ragg", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.2.5", "Requirements": [ "systemfonts", "textshaping" @@ -2145,7 +2288,13 @@ "Package": "readr", "Version": "2.1.4", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "readr", + "RemoteRef": "readr", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "2.1.4", "Requirements": [ "R", "R6", @@ -2168,7 +2317,13 @@ "Package": "readxl", "Version": "1.4.2", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "readxl", + "RemoteRef": "readxl", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.4.2", "Requirements": [ "R", "cellranger", @@ -2243,7 +2398,13 @@ "Package": "reprex", "Version": "2.0.2", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "reprex", + "RemoteRef": "reprex", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": 
"source", + "RemoteSha": "2.0.2", "Requirements": [ "R", "callr", @@ -2371,7 +2532,13 @@ "Package": "rvest", "Version": "1.0.3", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "rvest", + "RemoteRef": "rvest", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.0.3", "Requirements": [ "R", "cli", @@ -2471,7 +2638,13 @@ "Package": "shape", "Version": "1.4.6", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "shape", + "RemoteRef": "shape", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.4.6", "Requirements": [ "R", "grDevices", @@ -2701,7 +2874,13 @@ "Package": "tidyverse", "Version": "2.0.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "tidyverse", + "RemoteRef": "tidyverse", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "2.0.0", "Requirements": [ "R", "broom", @@ -2741,7 +2920,13 @@ "Package": "timeDate", "Version": "4022.108", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "timeDate", + "RemoteRef": "timeDate", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "4022.108", "Requirements": [ "R", "graphics", @@ -2915,7 +3100,13 @@ "Package": "vroom", "Version": "1.6.3", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", + "RemoteType": "standard", + "RemotePkgRef": "vroom", + "RemoteRef": "vroom", + "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", + "RemotePkgPlatform": "source", + "RemoteSha": "1.6.3", "Requirements": [ "R", "bit64", diff --git a/slide.qmd b/slide.qmd index 
dc680fa..3cd4a4f 100644 --- a/slide.qmd +++ b/slide.qmd @@ -172,7 +172,7 @@ arx_forecaster <- function( args_list = arx_args_list( lags = c(0, 7, 14), ahead = 7, - levels = c(0.05, 0.95) + quantile_levels = c(0.05, 0.95) ) ) @@ -205,7 +205,7 @@ fcasts <- epi_slide( fcasts <- fcasts %>% select(geo_value, time_value, cases_7dav, contains("_distn"), fc_target_date) %>% - pivot_quantiles(contains("_distn")) + pivot_quantiles_wider(contains("_distn")) fcasts ``` @@ -239,7 +239,7 @@ k_week_ahead <- function(ahead = 7) { new_col_name = "fc") %>% select(geo_value, time_value, cases_7dav, contains("_distn"), fc_target_date) %>% - pivot_quantiles(contains("_distn")) + pivot_quantiles_wider(contains("_distn")) } # First generate the forecasts, and bind them together diff --git a/sliding-forecasters.qmd b/sliding-forecasters.qmd index 06bd583..327f9e4 100644 --- a/sliding-forecasters.qmd +++ b/sliding-forecasters.qmd @@ -100,7 +100,7 @@ fc <- bind_rows( )) %>% list_rbind() ) %>% - pivot_quantiles(contains("_distn")) + pivot_quantiles_wider(contains("_distn")) ``` Here, `arx_forecaster()` does all the heavy lifting. It creates leads of the @@ -193,7 +193,7 @@ can_fc <- bind_rows( )) %>% list_rbind() ) %>% - pivot_quantiles(contains("_distn")) + pivot_quantiles_wider(contains("_distn")) ``` The first figure shows the results for all of the provinces using linear regression. @@ -284,7 +284,7 @@ k_week_versioning <- function(ahead, version = c("faithful", "unfaithful")) { fc <- bind_rows( map(aheads, ~ k_week_versioning(.x, "faithful")) %>% list_rbind(), map(aheads, ~ k_week_versioning(.x, "unfaithful")) %>% list_rbind() -) %>% pivot_quantiles(fc_.pred_distn) +) %>% pivot_quantiles_wider(fc_.pred_distn) ``` Now we can plot the results on top of the latest case rates. As before, we will only display and focus on the results for FL and CA for simplicity. From 9367e63f6f9a884e7c86f04a91c2779706c137c2 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brook" Date: Fri, 15 Dec 2023 05:18:28 -0800 Subject: [PATCH 2/8] Add additional updated svg's in _freeze just in case A first pass visual diff did not identify any "real" differences with the current versions, and suggests that some of these changes might just be a tiny offset and/or size adjustment, but this would need a double check before trying to avoid the updates to the _freeze & corresponding repo bloat. --- .../epidf/figure-html/unnamed-chunk-11-1.svg | 461 +- .../epidf/figure-html/unnamed-chunk-13-1.svg | 729 +- .../epidf/figure-html/unnamed-chunk-15-1.svg | 2951 ++-- .../figure-html/unnamed-chunk-12-1.svg | 691 +- .../figure-html/unnamed-chunk-13-1.svg | 813 +- .../figure-html/unnamed-chunk-14-1.svg | 875 +- .../figure-html/unnamed-chunk-11-1.svg | 599 +- .../figure-html/unnamed-chunk-11-2.svg | 653 +- .../figure-html/unnamed-chunk-4-1.svg | 3799 +++-- .../figure-html/unnamed-chunk-5-1.svg | 377 +- .../figure-html/unnamed-chunk-7-1.svg | 611 +- .../figure-html/unnamed-chunk-9-1.svg | 615 +- .../index/figure-html/unnamed-chunk-8-1.svg | 418 +- .../figure-html/unnamed-chunk-3-1.svg | 537 +- .../figure-html/unnamed-chunk-7-1.svg | 1278 +- .../figure-html/unnamed-chunk-7-2.svg | 1224 +- .../figure-html/unnamed-chunk-9-1.svg | 553 +- .../figure-html/unnamed-chunk-9-1.svg | 481 +- .../slide/figure-html/unnamed-chunk-12-1.svg | 3219 ++-- .../slide/figure-html/unnamed-chunk-8-1.svg | 13652 ++++++++-------- .../figure-html/plot-ar-asof-1.svg | 1998 ++- .../figure-html/plot-arx-1.svg | 1983 +-- .../figure-html/plot-can-fc-boost-1.svg | 5468 +++---- .../figure-html/plot-can-fc-lr-1.svg | 5135 +++--- 24 files changed, 24241 insertions(+), 24879 deletions(-) diff --git a/_freeze/epidf/figure-html/unnamed-chunk-11-1.svg b/_freeze/epidf/figure-html/unnamed-chunk-11-1.svg index c63da36..dea577d 100644 --- a/_freeze/epidf/figure-html/unnamed-chunk-11-1.svg +++ b/_freeze/epidf/figure-html/unnamed-chunk-11-1.svgdiff --git a/_freeze/epidf/figure-html/unnamed-chunk-13-1.svg 
b/_freeze/epidf/figure-html/unnamed-chunk-13-1.svg index 2a98268..86623eb 100644 --- a/_freeze/epidf/figure-html/unnamed-chunk-13-1.svg +++ b/_freeze/epidf/figure-html/unnamed-chunk-13-1.svg @@ -3,836 +3,833 @@ - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + - + - + - + - + - + - + - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + + - + - + - - + + - + - - - + + + - + - + - - + + - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - + + + + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - + + + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - - - - + + + + @@ -843,41 +840,41 @@ - - - - - - - - - - + + + + + + + + + + - - - - - - - - - + + + + + + + + + - - - - - + + + + + - - - - - - + + + + + + diff --git a/_freeze/epidf/figure-html/unnamed-chunk-15-1.svg b/_freeze/epidf/figure-html/unnamed-chunk-15-1.svg index e865362..c196909 100644 --- a/_freeze/epidf/figure-html/unnamed-chunk-15-1.svg +++ b/_freeze/epidf/figure-html/unnamed-chunk-15-1.svg @@ -3,2095 +3,2092 @@ - - + - + - + - + - + - + - + - + 
- + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + - + - + - + - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - - - - - - - - - + + + + + + + + + - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + + + - + - - - - - - - - - - - - - - + + + + + + + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - - - - - - - + + + + + + + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - + - - + + - - + + - - + + - - + + - - - - - - - - - + + + + + + + + + - + - - + + - - + + - - - + + + - + - - + + - - + + - - + + - - - - + + + + - + - + - - + + - - + + - - - - + + + + - - - - + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_freeze/flatline-forecaster/figure-html/unnamed-chunk-12-1.svg 
b/_freeze/flatline-forecaster/figure-html/unnamed-chunk-12-1.svg index 7cf05d8..1c11a3f 100644 --- a/_freeze/flatline-forecaster/figure-html/unnamed-chunk-12-1.svg +++ b/_freeze/flatline-forecaster/figure-html/unnamed-chunk-12-1.svgdiff --git a/_freeze/flatline-forecaster/figure-html/unnamed-chunk-13-1.svg b/_freeze/flatline-forecaster/figure-html/unnamed-chunk-13-1.svg index d2f2597..1c11a3f 100644 --- a/_freeze/flatline-forecaster/figure-html/unnamed-chunk-13-1.svg +++ b/_freeze/flatline-forecaster/figure-html/unnamed-chunk-13-1.svgdiff --git a/_freeze/flatline-forecaster/figure-html/unnamed-chunk-14-1.svg b/_freeze/flatline-forecaster/figure-html/unnamed-chunk-14-1.svg index d2f2597..9d59241 100644 --- a/_freeze/flatline-forecaster/figure-html/unnamed-chunk-14-1.svg +++ b/_freeze/flatline-forecaster/figure-html/unnamed-chunk-14-1.svg @@ -3,880 +3,877 @@ - - + - + - + - + - + - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - - + + 
- + - + - + - - - - - - - - - + + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - - - - - - + + + + + + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - + + + - - - - - - - - - + + + + + + + + + - - - - + + + + - - - - + + + + - - - - + + + + - - - - + + + + - - - - - - - - - - + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - + + + + + + + + + + + + diff --git a/_freeze/growth-rates/figure-html/unnamed-chunk-11-1.svg b/_freeze/growth-rates/figure-html/unnamed-chunk-11-1.svg index 4258fe2..d6c9162 100644 --- a/_freeze/growth-rates/figure-html/unnamed-chunk-11-1.svg +++ b/_freeze/growth-rates/figure-html/unnamed-chunk-11-1.svgdiff --git a/_freeze/growth-rates/figure-html/unnamed-chunk-11-2.svg b/_freeze/growth-rates/figure-html/unnamed-chunk-11-2.svg index 7a42322..f7011ae 100644 --- a/_freeze/growth-rates/figure-html/unnamed-chunk-11-2.svg +++ b/_freeze/growth-rates/figure-html/unnamed-chunk-11-2.svgdiff --git a/_freeze/growth-rates/figure-html/unnamed-chunk-4-1.svg b/_freeze/growth-rates/figure-html/unnamed-chunk-4-1.svg index 3af0467..5ad0a11 100644 --- a/_freeze/growth-rates/figure-html/unnamed-chunk-4-1.svg +++ b/_freeze/growth-rates/figure-html/unnamed-chunk-4-1.svgdiff --git a/_freeze/growth-rates/figure-html/unnamed-chunk-5-1.svg b/_freeze/growth-rates/figure-html/unnamed-chunk-5-1.svg index a88542f..d066ef0 100644 --- a/_freeze/growth-rates/figure-html/unnamed-chunk-5-1.svg +++ b/_freeze/growth-rates/figure-html/unnamed-chunk-5-1.svgdiff --git a/_freeze/growth-rates/figure-html/unnamed-chunk-7-1.svg b/_freeze/growth-rates/figure-html/unnamed-chunk-7-1.svg index 2d8c3c6..00a9f54 100644 --- a/_freeze/growth-rates/figure-html/unnamed-chunk-7-1.svg +++ 
b/_freeze/growth-rates/figure-html/unnamed-chunk-7-1.svgdiff --git a/_freeze/growth-rates/figure-html/unnamed-chunk-9-1.svg b/_freeze/growth-rates/figure-html/unnamed-chunk-9-1.svg index 054ac42..043bbe9 100644 --- a/_freeze/growth-rates/figure-html/unnamed-chunk-9-1.svg +++ b/_freeze/growth-rates/figure-html/unnamed-chunk-9-1.svgdiff --git a/_freeze/index/figure-html/unnamed-chunk-8-1.svg b/_freeze/index/figure-html/unnamed-chunk-8-1.svg index af27ae9..bdae1b0 100644 --- a/_freeze/index/figure-html/unnamed-chunk-8-1.svg +++ b/_freeze/index/figure-html/unnamed-chunk-8-1.svgdiff --git a/_freeze/outliers/figure-html/unnamed-chunk-3-1.svg b/_freeze/outliers/figure-html/unnamed-chunk-3-1.svg index 7f2fe6a..c6617b1 100644 --- a/_freeze/outliers/figure-html/unnamed-chunk-3-1.svg +++ b/_freeze/outliers/figure-html/unnamed-chunk-3-1.svgdiff --git a/_freeze/outliers/figure-html/unnamed-chunk-7-1.svg b/_freeze/outliers/figure-html/unnamed-chunk-7-1.svg index 95e4b93..32ce016 100644 --- a/_freeze/outliers/figure-html/unnamed-chunk-7-1.svg +++ b/_freeze/outliers/figure-html/unnamed-chunk-7-1.svgdiff --git a/_freeze/outliers/figure-html/unnamed-chunk-7-2.svg b/_freeze/outliers/figure-html/unnamed-chunk-7-2.svg index 3bb681b..d2e3de1 100644 --- a/_freeze/outliers/figure-html/unnamed-chunk-7-2.svg +++ b/_freeze/outliers/figure-html/unnamed-chunk-7-2.svgdiff --git a/_freeze/outliers/figure-html/unnamed-chunk-9-1.svg b/_freeze/outliers/figure-html/unnamed-chunk-9-1.svg index e7224f5..c42bd19 100644 --- a/_freeze/outliers/figure-html/unnamed-chunk-9-1.svg +++ b/_freeze/outliers/figure-html/unnamed-chunk-9-1.svgdiff --git a/_freeze/preprocessing-and-models/figure-html/unnamed-chunk-9-1.svg b/_freeze/preprocessing-and-models/figure-html/unnamed-chunk-9-1.svg index dbd950f..b1d6713 100644 --- a/_freeze/preprocessing-and-models/figure-html/unnamed-chunk-9-1.svg +++ b/_freeze/preprocessing-and-models/figure-html/unnamed-chunk-9-1.svgdiff --git 
a/_freeze/slide/figure-html/unnamed-chunk-12-1.svg b/_freeze/slide/figure-html/unnamed-chunk-12-1.svg index 5470452..1c5a420 100644 --- a/_freeze/slide/figure-html/unnamed-chunk-12-1.svg +++ b/_freeze/slide/figure-html/unnamed-chunk-12-1.svgdiff --git a/_freeze/slide/figure-html/unnamed-chunk-8-1.svg b/_freeze/slide/figure-html/unnamed-chunk-8-1.svg index e7ca1cb..ad8c8f0 100644 --- a/_freeze/slide/figure-html/unnamed-chunk-8-1.svg +++ b/_freeze/slide/figure-html/unnamed-chunk-8-1.svgdiff --git a/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg b/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg index dee5222..b6ff68b 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svgdiff --git a/_freeze/sliding-forecasters/figure-html/plot-arx-1.svg b/_freeze/sliding-forecasters/figure-html/plot-arx-1.svg index 45697e8..4145a41 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-arx-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-arx-1.svgdiff --git a/_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svg b/_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svg index a6359a2..c6d0522 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svgdiff --git a/_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svg b/_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svg index 0d0888b..92e22be 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svg From 77a684fc81921b7dc45d742d63b6f0ae7178f103 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 30 Apr 2024 19:36:04 -0700 Subject: [PATCH 3/8] feat: update for epiprocess R6 refactor * remove references to R6 and mutation * use epiprocess correctly * fix the authors section of DESCRIPTION * upgrade renv * update all
packages in renv * integrate Rprofile with user Rprofile --- .Rprofile | 6 + DESCRIPTION | 11 +- archive.qmd | 91 ++--- epiprocess.qmd | 18 +- renv.lock | 962 ++++++++++++++++++++---------------------------- renv/activate.R | 448 +++++++++++++++------- 6 files changed, 769 insertions(+), 767 deletions(-) diff --git a/.Rprofile b/.Rprofile index 81b960f..b7ee845 100644 --- a/.Rprofile +++ b/.Rprofile @@ -1 +1,7 @@ source("renv/activate.R") + +# Check if user .Rprofile exists +if (file.exists("~/.Rprofile")) { + # Source user .Rprofile + source("~/.Rprofile") +} diff --git a/DESCRIPTION b/DESCRIPTION index 3a0985a..0a92101 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,11 +2,12 @@ Package: delphitoolingbook Title: Delphi Tooling Version: 0.0.0.9999 Authors@R: c( - person("Daniel", "McDonald", "J.", "daniel@stat.ubc.ca", role = c("cre", "aut"), - person("Logan", "Brooks", role = c("cre","aut"), - person("Rachel", "Lobay", role = "aut")) - person("Ryan", "Tibshirani", "J.", "ryantibs@berkeley.edu", role = "aut"), -Description: + person("Daniel", "McDonald", "J.", "daniel@stat.ubc.ca", role = c("cre", "aut")), + person("Logan", "Brooks", role = c("cre","aut")), + person("Rachel", "Lobay", role = "aut"), + person("Ryan", "Tibshirani", "J.", "ryantibs@berkeley.edu", role = "aut") + ) +Description: | This book is a longform introduction to analysing and forecasting epidemiological data. License: MIT + file LICENSE Imports: diff --git a/archive.qmd b/archive.qmd index d873f2c..0650f1b 100644 --- a/archive.qmd +++ b/archive.qmd @@ -25,9 +25,8 @@ source("_common.R") ## Getting data into `epi_archive` format -An `epi_archive` object -can be constructed from a data frame, data table, or tibble, provided that it -has (at least) the following columns: +An `epi_archive` object can be constructed from a data frame, data table, or +tibble, provided that it has (at least) the following columns: * `geo_value`: the geographic value associated with each row of measurements. 
* `time_value`: the time value associated with each row of measurements. @@ -55,10 +54,10 @@ class(x) print(x) ``` -An `epi_archive` is special kind of class called an R6 class. Its primary field -is a data table `DT`, which is of class `data.table` (from the `data.table` -package), and has columns `geo_value`, `time_value`, `version`, as well as any -number of additional columns. +An `epi_archive` is an S3 class. Its primary field is a data table `DT`, which +is of class `data.table` (from the `data.table` package), and has columns +`geo_value`, `time_value`, `version`, as well as any number of additional +columns. ```{r} class(x$DT) @@ -70,33 +69,18 @@ for the data table, as well as any other specified in the metadata (described below). There can only be a single row per unique combination of key variables, and therefore the key variables are critical for figuring out how to generate a snapshot of data from the archive, as of a given version (also described below). - + ```{r, error=TRUE} key(x$DT) ``` - -In general, the last version of each observation is carried forward (LOCF) to -fill in data between recorded versions. **A word of caution:** R6 objects, -unlike most other objects in R, have reference semantics. An important -consequence of this is that objects are not copied when modified. - -```{r} -original_value <- x$DT$percent_cli[1] -y <- x # This DOES NOT make a copy of x -y$DT$percent_cli[1] = 0 -head(y$DT) -head(x$DT) -x$DT$percent_cli[1] <- original_value -``` -To make a copy, we can use the `clone()` method for an R6 class, as in `y <- -x$clone()`. You can read more about reference semantics in Hadley Wickham's -[Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. +In general, the last version of each observation is carried forward (LOCF) to +fill in data between recorded versions. 
## Some details on metadata The following pieces of metadata are included as fields in an `epi_archive` -object: +object: * `geo_type`: the type for the geo values. * `time_type`: the type for the time values. @@ -112,10 +96,8 @@ call (as it did in the case above). A key method of an `epi_archive` class is `as_of()`, which generates a snapshot of the archive in `epi_df` format. This represents the most up-to-date values of -the signal variables as of a given version. This can be accessed via `x$as_of()` -for an `epi_archive` object `x`, but the package also provides a simple wrapper -function `epix_as_of()` since this is likely a more familiar interface for users -not familiar with R6 (or object-oriented programming). +the signal variables as of a given version. This can be accessed via +`epix_as_of()`. ```{r} x_snapshot <- epix_as_of(x, max_version = as.Date("2021-06-01")) @@ -125,7 +107,7 @@ max(x_snapshot$time_value) attributes(x_snapshot)$metadata$as_of ``` -We can see that the max time value in the `epi_df` object `x_snapshot` that was +We can see that the max time value in the `epi_df` object `x_snapshot` that was generated from the archive is May 29, 2021, even though the specified version date was June 1, 2021. From this we can infer that the doctor's visits signal was 2 days latent on June 1. Also, we can see that the metadata in the `epi_df` @@ -134,7 +116,7 @@ object has the version date recorded in the `as_of` field. By default, using the maximum of the `version` column in the underlying data table in an `epi_archive` object itself generates a snapshot of the latest values of signal variables in the entire archive. The `epix_as_of()` function issues a warning in -this case, since updates to the current version may still come in at a later +this case, since updates to the current version may still come in at a later point in time, due to various reasons, such as synchronization issues. 
```{r} @@ -143,15 +125,15 @@ x_latest <- epix_as_of(x, max_version = max(x$DT$version)) Below, we pull several snapshots from the archive, spaced one month apart. We overlay the corresponding signal curves as colored lines, with the version dates -marked by dotted vertical lines, and draw the latest curve in black (from the +marked by dotted vertical lines, and draw the latest curve in black (from the latest snapshot `x_latest` that the archive can provide). ```{r, fig.width = 8, fig.height = 7} self_max <- max(x$DT$version) versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") snapshots <- map( - versions, - function(v) { + versions, + function(v) { epix_as_of(x, max_version = v) %>% mutate(version = v) }) %>% list_rbind() %>% @@ -162,37 +144,35 @@ snapshots <- map( ```{r, fig.height=7} #| code-fold: true ggplot(snapshots %>% filter(!latest), - aes(x = time_value, y = percent_cli)) + - geom_line(aes(color = factor(version)), na.rm = TRUE) + + aes(x = time_value, y = percent_cli)) + + geom_line(aes(color = factor(version)), na.rm = TRUE) + geom_vline(aes(color = factor(version), xintercept = version), lty = 2) + facet_wrap(~ geo_value, scales = "free_y", ncol = 1) + scale_x_date(minor_breaks = "month", date_labels = "%b %Y") + scale_color_viridis_d(option = "A", end = .9) + - labs(x = "Date", y = "% of doctor's visits with CLI") + + labs(x = "Date", y = "% of doctor's visits with CLI") + theme(legend.position = "none") + geom_line(data = snapshots %>% filter(latest), - aes(x = time_value, y = percent_cli), + aes(x = time_value, y = percent_cli), inherit.aes = FALSE, color = "black", na.rm = TRUE) ``` We can see some interesting and highly nontrivial revision behavior: at some points in time the provisional data snapshots grossly underestimate the latest curve (look in particular at Florida close to the end of 2021), and at others -they overestimate it (both states towards the beginning of 2021), though not +they overestimate it (both states towards 
the beginning of 2021), though not quite as dramatically. Modeling the revision process, which is often called *backfill modeling*, is an important statistical problem in it of itself. -## Merging `epi_archive` objects +## Merging `epi_archive` objects Now we demonstrate how to merge two `epi_archive` objects together, e.g., so that grabbing data from multiple sources as of a particular version can be -performed with a single `as_of` call. The `epi_archive` class provides a method -`merge()` precisely for this purpose. The wrapper function is called -`epix_merge()`; this wrapper avoids mutating its inputs, while `x$merge` will -mutate `x`. Below we merge the working `epi_archive` of versioned percentage CLI -from outpatient visits to another one of versioned COVID-19 case reporting data, -which we fetch the from the [COVIDcast +performed with a single `as_of` call. The `epiprocess` package provides +`epix_merge()` for this purpose. Below we merge the working `epi_archive` of +versioned percentage CLI from outpatient visits to another one of versioned +COVID-19 case reporting data, which we fetch from the [COVIDcast API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html/), on the rate scale (counts per 100,000 people in the population). @@ -209,7 +189,7 @@ When merging archives, unless the archives have identical data release patterns, the other). ```{r, message = FALSE, warning = FALSE,eval=FALSE} -# This code is for illustration and doesn't run. +# This code is for illustration and doesn't run. 
# The result is saved/loaded in the (hidden) next chunk from `{epidatasets}` y <- pub_covidcast( source = "jhu-csse", @@ -223,24 +203,13 @@ y <- pub_covidcast( select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% as_epi_archive(compactify = TRUE) -x$merge(y, sync = "locf", compactify = FALSE) +x <- epix_merge(x, y, sync = "locf", compactify = FALSE) print(x) head(x$DT) ``` -```{r, echo=FALSE} -x <- archive_cases_dv_subset -print(x) -head(x$DT) -``` - -Importantly, see that `x$merge` mutated `x` to hold the result of the merge. We -could also have used `xy = epix_merge(x, y)` to avoid mutating `x`. See the -documentation for either for more detailed descriptions of what mutation, -pointer aliasing, and pointer reseating is possible. - ## Sliding version-aware computations - + ::: {.callout-note} TODO: need a simple example here. ::: diff --git a/epiprocess.qmd b/epiprocess.qmd index 82a4690..e2e5776 100644 --- a/epiprocess.qmd +++ b/epiprocess.qmd @@ -15,17 +15,17 @@ contains the most up-to-date values of the signals variables, as of a given time. By convention, functions in the `epiprocess` package that operate on `epi_df` -objects begin with `epi`. For example: +objects begin with `epi`. For example: - `epi_slide()`, for iteratively applying a custom computation to a variable in an `epi_df` object over sliding windows in time; - + - `epi_cor()`, for computing lagged correlations between variables in an `epi_df` object, (allowing for grouping by geo value, time value, or any other variables). Functions in the package that operate directly on given variables do not begin - with `epi`. For example: + with `epi`. 
For example: - `growth_rate()`, for estimating the growth rate of a given signal at given time values, using various methodologies; @@ -35,20 +35,18 @@ Functions in the package that operate directly on given variables do not begin ## `epi_archive`: full version history of a data set -The second main data structure in the package is called -[`epi_archive`]. This is a special class (R6 format) -wrapped around a data table that stores the archive (version history) of some -signal variables of interest. +The second main data structure in the package is called [`epi_archive`]. This is +an S3 class containing a data table that stores the archive (version history) of +some signal variables of interest. By convention, functions in the `epiprocess` package that operate on `epi_archive` objects begin with `epix` (the "x" is meant to remind you of -"archive"). These are just wrapper functions around the public methods for the -`epi_archive` R6 class. For example: +"archive"). For example: - `epix_as_of()`, for generating a snapshot in `epi_df` format from the data archive, which represents the most up-to-date values of the signal variables, as of the specified version; - + - `epix_fill_through_version()`, for filling in some fake version data following simple rules, for use when downstream methods expect an archive that is more up-to-date (e.g., if it is a forecasting deadline date and one of our data diff --git a/renv.lock b/renv.lock index 82137ae..ad975e4 100644 --- a/renv.lock +++ b/renv.lock @@ -1,41 +1,41 @@ { "R": { - "Version": "4.3.0", + "Version": "4.3.3", "Repositories": [ { - "Name": "CRAN", - "URL": "https://cloud.r-project.org" + "Name": "RSPM", + "URL": "https://packagemanager.posit.co/all/latest" } ] }, "Packages": { "BH": { "Package": "BH", - "Version": "1.81.0-1", + "Version": "1.84.0-0", "Source": "Repository", "Repository": "RSPM", - "Hash": "68122010f01c4dcfbe58ce7112f2433d" + "Hash": "a8235afbcd6316e6e91433ea47661013" }, "DBI": { "Package": "DBI", - 
"Version": "1.1.3", + "Version": "1.2.2", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "methods" ], - "Hash": "b2866e62bab9378c3cc9476a1954226b" + "Hash": "164809cd72e1d5160b4cb3aa57f510fe" }, "DiceDesign": { "Package": "DiceDesign", - "Version": "1.9", + "Version": "1.10", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "b7b812ae4484d4bbf0a0baac72e8fc01" + "Hash": "ac8b12951882c375d1a14f64c93e78f1" }, "GPfit": { "Package": "GPfit", @@ -50,14 +50,14 @@ }, "KernSmooth": { "Package": "KernSmooth", - "Version": "2.23-21", + "Version": "2.23-22", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "stats" ], - "Hash": "6314fc110e09548ba889491db6ae67fb" + "Hash": "2fecebc3047322fa5930f74fae5de70f" }, "MASS": { "Package": "MASS", @@ -83,11 +83,12 @@ }, "Matrix": { "Package": "Matrix", - "Version": "1.5-4", + "Version": "1.6-5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", + "grDevices", "graphics", "grid", "lattice", @@ -95,11 +96,11 @@ "stats", "utils" ], - "Hash": "e779c7d9f35cc364438578f334cffee2" + "Hash": "8c7115cd3a0e048bda2a7cd110549f7a" }, "MatrixModels": { "Package": "MatrixModels", - "Version": "0.5-1", + "Version": "0.5-3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -108,61 +109,7 @@ "methods", "stats" ], - "Hash": "963ab8fbaf980a5b081ed40419081439" - }, - "R.cache": { - "Package": "R.cache", - "Version": "0.16.0", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "R.methodsS3", - "R.oo", - "R.utils", - "digest", - "utils" - ], - "Hash": "fe539ca3f8efb7410c3ae2cf5fe6c0f8" - }, - "R.methodsS3": { - "Package": "R.methodsS3", - "Version": "1.8.2", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "utils" - ], - "Hash": "278c286fd6e9e75d0c2e8f731ea445c8" - }, - "R.oo": { - "Package": "R.oo", - "Version": "1.25.0", - "Source": 
"Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "R.methodsS3", - "methods", - "utils" - ], - "Hash": "a0900a114f4f0194cf4aa8cd4a700681" - }, - "R.utils": { - "Package": "R.utils", - "Version": "2.12.2", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "R.methodsS3", - "R.oo", - "methods", - "tools", - "utils" - ], - "Hash": "325f01db13da12c04d8f6e7be36ff514" + "Hash": "0776bf7526869e0286b0463cb72fb211" }, "R6": { "Package": "R6", @@ -186,28 +133,27 @@ }, "Rcpp": { "Package": "Rcpp", - "Version": "1.0.10", + "Version": "1.0.12", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "methods", "utils" ], - "Hash": "e749cae40fa9ef469b6050959517453c" + "Hash": "5ea2700d21e038ace58269ecdbeb9ec0" }, "RcppEigen": { "Package": "RcppEigen", - "Version": "0.3.3.9.3", + "Version": "0.3.4.0.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ - "Matrix", "R", "Rcpp", "stats", "utils" ], - "Hash": "1e035db628cefb315c571202d70202fe" + "Hash": "df49e3306f232ec28f1604e36a202847" }, "RcppRoll": { "Package": "RcppRoll", @@ -258,13 +204,13 @@ }, "askpass": { "Package": "askpass", - "Version": "1.1", + "Version": "1.2.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "sys" ], - "Hash": "e8a22846fff485f0be3770c2da758713" + "Hash": "cad6cf7f1d5f6e906700b9d3e718c796" }, "backports": { "Package": "backports", @@ -288,7 +234,7 @@ }, "bayestestR": { "Package": "bayestestR", - "Version": "0.13.1", + "Version": "0.13.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -300,7 +246,7 @@ "stats", "utils" ], - "Hash": "61f643ea5ee9fe0e70ab0246340b3c2e" + "Hash": "4a6a2eebe2db1dfb1c792c4ed91e73dc" }, "bit": { "Package": "bit", @@ -352,9 +298,9 @@ }, "broom": { "Package": "broom", - "Version": "1.0.4", + "Version": "1.0.5", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "backports", @@ -369,27 +315,29 @@ "tibble", 
"tidyr" ], - "Hash": "f62b2504021369a2449c54bbda362d30" + "Hash": "fd25391c3c4f6ecf0fa95f1e6d15378c" }, "bslib": { "Package": "bslib", - "Version": "0.4.2", + "Version": "0.7.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "base64enc", "cachem", + "fastmap", "grDevices", "htmltools", "jquerylib", "jsonlite", + "lifecycle", "memoise", "mime", "rlang", "sass" ], - "Hash": "a7fbf03946ad741129dc81098722fca1" + "Hash": "8644cc53f43828f19133548195d7e59e" }, "cachem": { "Package": "cachem", @@ -404,22 +352,16 @@ }, "callr": { "Package": "callr", - "Version": "3.7.3", + "Version": "3.7.6", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "callr", - "RemoteRef": "callr", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "3.7.3", "Requirements": [ "R", "R6", "processx", "utils" ], - "Hash": "9b2191ede20fa29828139b9900922e51" + "Hash": "d7e13f49c19103ece9e58ad2d83a7354" }, "cellranger": { "Package": "cellranger", @@ -435,7 +377,7 @@ }, "checkmate": { "Package": "checkmate", - "Version": "2.2.0", + "Version": "2.3.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -443,7 +385,7 @@ "backports", "utils" ], - "Hash": "ca9c113196136f4a9ca9ce6079c2c99e" + "Hash": "c01cab1cb0f9125211a6fc99d540e315" }, "class": { "Package": "class", @@ -460,14 +402,14 @@ }, "cli": { "Package": "cli", - "Version": "3.6.1", + "Version": "3.6.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "utils" ], - "Hash": "89e6d8219950eac806ae0c489052048a" + "Hash": "1216ac65ac55ec0058a6f75d7ca0fd52" }, "clipr": { "Package": "clipr", @@ -497,13 +439,13 @@ }, "codetools": { "Package": "codetools", - "Version": "0.2-19", + "Version": "0.2-20", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "c089a619a7fae175d149d89164f8c7d8" + "Hash": "61e097f35917d342622f21cdc79c256e" }, "colorspace": 
{ "Package": "colorspace", @@ -550,10 +492,13 @@ }, "cpp11": { "Package": "cpp11", - "Version": "0.4.3", + "Version": "0.4.7", "Source": "Repository", "Repository": "RSPM", - "Hash": "ed588261931ee3be2c700d22e94a29ab" + "Requirements": [ + "R" + ], + "Hash": "5a295d7d963cc5035284dcdbaf334f4e" }, "crayon": { "Package": "crayon", @@ -575,7 +520,7 @@ }, "credentials": { "Package": "credentials", - "Version": "1.3.2", + "Version": "2.0.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -585,53 +530,47 @@ "openssl", "sys" ], - "Hash": "93762d0a34d78e6a025efdbfb5c6bb41" + "Hash": "c7844b32098dcbd1c59cbd8dddb4ecc6" }, "curl": { "Package": "curl", - "Version": "5.0.0", + "Version": "5.2.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "e4f97056611e8e6b8b852d13b7400cf1" + "Hash": "411ca2c03b1ce5f548345d2fc2685f7a" }, "data.table": { "Package": "data.table", - "Version": "1.14.8", + "Version": "1.15.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "methods" ], - "Hash": "b4c06e554f33344e044ccd7fdca750a9" + "Hash": "8ee9ac56ef633d0c7cab8b2ca87d683e" }, "datawizard": { "Package": "datawizard", - "Version": "0.7.1", + "Version": "0.10.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "insight", "stats", "utils" ], - "Hash": "001775ea47f92ef8e2916ed0cfa59735" + "Hash": "62d6ec10346d3302a1299e1c54641d83" }, "dbplyr": { "Package": "dbplyr", - "Version": "2.3.2", + "Version": "2.5.0", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "dbplyr", - "RemoteRef": "dbplyr", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "2.3.2", "Requirements": [ "DBI", "R", @@ -653,21 +592,20 @@ "vctrs", "withr" ], - "Hash": "d24305b92db333726aed162a2c23a147" + "Hash": "39b2e002522bfd258039ee4e889e0fd1" }, "desc": { "Package": "desc", - 
"Version": "1.4.2", + "Version": "1.4.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "R6", "cli", - "rprojroot", "utils" ], - "Hash": "6b9602c7ebbe87101a9c8edb6e8b6d21" + "Hash": "99b79fcbd6c4d1ce087f5c5c758b384f" }, "diagram": { "Package": "diagram", @@ -690,7 +628,7 @@ }, "dials": { "Package": "dials", - "Version": "1.2.0", + "Version": "1.2.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -710,42 +648,54 @@ "vctrs", "withr" ], - "Hash": "ce71836ecc0efd70890c6825c8b4ff47" + "Hash": "999e5fa12058a2bb3a8c204e637e4707" }, "digest": { "Package": "digest", - "Version": "0.6.31", + "Version": "0.6.35", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "utils" ], - "Hash": "8b708f296afd9ae69f450f9640be8990" + "Hash": "698ece7ba5a4fa4559e3d537e7ec3d31" }, "distributional": { "Package": "distributional", - "Version": "0.3.2", + "Version": "0.4.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ - "digest", - "farver", "generics", - "ggplot2", "lifecycle", "numDeriv", "rlang", - "scales", "stats", "utils", "vctrs" ], - "Hash": "0a94c3c917918a1c90f4609171ff41b6" + "Hash": "3bad76869f2257ea4fd00a3c08c2bcce" + }, + "doFuture": { + "Package": "doFuture", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "foreach", + "future", + "future.apply", + "globals", + "iterators", + "parallel", + "utils" + ], + "Hash": "bd269daa182b205fa471c89ee9dcc8df" }, "dplyr": { "Package": "dplyr", - "Version": "1.1.2", + "Version": "1.1.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -764,7 +714,7 @@ "utils", "vctrs" ], - "Hash": "dea6970ff715ca541c387de363ff405e" + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" }, "dtplyr": { "Package": "dtplyr", @@ -793,7 +743,7 @@ }, "effectsize": { "Package": "effectsize", - "Version": "0.8.3", + "Version": "0.8.7", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -806,7 +756,7 @@ "stats", "utils" 
], - "Hash": "7e2f03436f6d34d7c856dc437b657274" + "Hash": "a6900d0b5bdcbb956dbb843643279e7c" }, "ellipsis": { "Package": "ellipsis", @@ -828,66 +778,70 @@ "RemoteUsername": "cmu-delphi", "RemoteRepo": "epidatasets", "RemoteRef": "main", - "RemoteSha": "cc8f2a05ed5f0d3ae383997c763c96dd009aa9b5", - "Remotes": "cmu-delphi/epidatr, cmu-delphi/epipredict, cmu-delphi/epiprocess", + "RemoteSha": "ca86f0326e4eb08316b40972c7d3c98217e9941e", + "Remotes": "cmu-delphi/epidatr, cmu-delphi/epiprocess", "Requirements": [ "R" ], - "Hash": "d79d32cc56c4b996ec778c8a18e9bf82" + "Hash": "3deba70da0ce06354cbd3206b16e36a2" }, "epidatr": { "Package": "epidatr", - "Version": "1.0.0.9000", + "Version": "1.1.5", "Source": "GitHub", "RemoteType": "github", "RemoteHost": "api.github.com", "RemoteUsername": "cmu-delphi", "RemoteRepo": "epidatr", "RemoteRef": "dev", - "RemoteSha": "6e9f8996dfbb4a27cff7eebe66ccb04df91caa1a", + "RemoteSha": "626c30bc07f4aae3c3e6a6c6b825a6cd5eee1ce7", "Requirements": [ "MMWRweek", "R", "cachem", + "cachem", "checkmate", "cli", "glue", + "glue", "httr", "jsonlite", "magrittr", "openssl", "purrr", + "rappdirs", "readr", "tibble", "usethis", + "usethis", "xml2" ], - "Hash": "ccf545a4147535c885aec5280369bbd5" + "Hash": "869d57a2ad4002670ad28939fe050e82" }, "epipredict": { "Package": "epipredict", - "Version": "0.0.6", + "Version": "0.0.14", "Source": "GitHub", "RemoteType": "github", "RemoteHost": "api.github.com", "RemoteUsername": "cmu-delphi", "RemoteRepo": "epipredict", - "RemoteRef": "main", - "RemoteSha": "378577a213aa59043dea73e8fdea725432f32f16", + "RemoteRef": "dev", + "RemoteSha": "5e50a5a112b663eff85fcac5586875352157a5c4", "Remotes": "cmu-delphi/epidatr, cmu-delphi/epiprocess, dajmcdon/smoothqr", "Requirements": [ "R", + "checkmate", "cli", "distributional", "dplyr", "epiprocess", - "fs", "generics", + "ggplot2", "glue", "hardhat", "lifecycle", "magrittr", - "methods", "parsnip", "quantreg", "recipes", @@ -897,26 +851,27 @@ "tibble", "tidyr", "tidyselect", 
- "usethis", + "tsibble", "vctrs", "workflows" ], - "Hash": "6510b50d6231ea104960ae17175a3205" + "Hash": "4531cf03e3c8955857df663d7366a8f4" }, "epiprocess": { "Package": "epiprocess", - "Version": "0.7.0.9999", + "Version": "0.7.7", "Source": "GitHub", "RemoteType": "github", "RemoteHost": "api.github.com", "RemoteUsername": "cmu-delphi", "RemoteRepo": "epiprocess", "RemoteRef": "dev", - "RemoteSha": "b444a3cd718034ebdb10e95d1d7f6c00cac0b1d9", + "RemoteSha": "4e65e51bb56ab70cc98fa2d37dd35a4ab2336620", "Remotes": "cmu-delphi/epidatr, reconverse/outbreaks, glmgen/genlasso", "Requirements": [ "R", "R6", + "checkmate", "cli", "data.table", "dplyr", @@ -924,6 +879,7 @@ "feasts", "generics", "genlasso", + "ggplot2", "lifecycle", "lubridate", "magrittr", @@ -937,22 +893,22 @@ "utils", "vctrs" ], - "Hash": "57dcbaf9ee409ffea0dbc61077915c79" + "Hash": "998ba22373923380e1ce7e787d11af18" }, "evaluate": { "Package": "evaluate", - "Version": "0.21", + "Version": "0.23", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "methods" ], - "Hash": "d59f3b464e8da1aef82dc04b588b8dfb" + "Hash": "daf4a1246be12c1fa8c7705a0935c1a0" }, "fabletools": { "Package": "fabletools", - "Version": "0.3.3", + "Version": "0.4.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -961,10 +917,12 @@ "distributional", "dplyr", "generics", + "ggdist", "ggplot2", "lifecycle", "progressr", "rlang", + "scales", "stats", "tibble", "tidyr", @@ -973,11 +931,11 @@ "utils", "vctrs" ], - "Hash": "192130a87f00fecf175f752ec72c1db7" + "Hash": "005e92a674b01825e0feb29931c03c5e" }, "fansi": { "Package": "fansi", - "Version": "1.0.4", + "Version": "1.0.6", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -985,7 +943,7 @@ "grDevices", "utils" ], - "Hash": "1d9e7ad3c8312a192dea7d3db0274fde" + "Hash": "962174cf2aeb5b9eea581522286a911f" }, "farver": { "Package": "farver", @@ -1003,7 +961,7 @@ }, "feasts": { "Package": "feasts", - "Version": "0.3.1", + "Version": "0.3.2", 
"Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1024,11 +982,11 @@ "utils", "vctrs" ], - "Hash": "4e2130fecd2802de19c69d129abab1db" + "Hash": "d15631c019c27e50b1a99e3e9b3b53e1" }, "fontawesome": { "Package": "fontawesome", - "Version": "0.5.1", + "Version": "0.5.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1036,7 +994,7 @@ "htmltools", "rlang" ], - "Hash": "1e22b8cabbad1eae951a75e9f8b52378" + "Hash": "c2efdd5f0bcd1ea861c2d4e2a883a67d" }, "forcats": { "Package": "forcats", @@ -1075,14 +1033,14 @@ }, "fs": { "Package": "fs", - "Version": "1.6.2", + "Version": "1.6.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "methods" ], - "Hash": "94af08e0aa9675a16fadbb3aaaa90d2a" + "Hash": "15aeb8c27f5ea5161f9f6a641fafd93a" }, "furrr": { "Package": "furrr", @@ -1108,15 +1066,9 @@ }, "future": { "Package": "future", - "Version": "1.32.0", + "Version": "1.33.2", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "future", - "RemoteRef": "future", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.32.0", "Requirements": [ "digest", "globals", @@ -1125,19 +1077,13 @@ "parallelly", "utils" ], - "Hash": "c68517cf2f78be4ea86e140b8598a4ca" + "Hash": "fd7b1d69d16d0d114e4fa82db68f184c" }, "future.apply": { "Package": "future.apply", - "Version": "1.11.0", + "Version": "1.11.2", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "future.apply", - "RemoteRef": "future.apply", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.11.0", "Requirements": [ "R", "future", @@ -1145,13 +1091,13 @@ "parallel", "utils" ], - "Hash": "ba4be138fe47eac3e16a6deaa4da106e" + "Hash": "afe1507511629f44572e6c53b9baeb7c" }, "gargle": { "Package": "gargle", - "Version": "1.4.0", + "Version": "1.5.2", "Source": "Repository", - 
"Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "cli", @@ -1167,7 +1113,7 @@ "utils", "withr" ], - "Hash": "8c72a723822dc317613da5ff8e8da6ee" + "Hash": "fc0b272e5847c58cd5da9b20eedbd026" }, "generics": { "Package": "generics", @@ -1199,7 +1145,7 @@ }, "gert": { "Package": "gert", - "Version": "1.9.2", + "Version": "2.0.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1210,11 +1156,35 @@ "sys", "zip" ], - "Hash": "9122b3958e749badb5c939f498038b57" + "Hash": "f70d3fe2d9e7654213a946963d1591eb" + }, + "ggdist": { + "Package": "ggdist", + "Version": "3.3.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "Rcpp", + "cli", + "distributional", + "ggplot2", + "glue", + "grid", + "gtable", + "numDeriv", + "quadprog", + "rlang", + "scales", + "tibble", + "vctrs", + "withr" + ], + "Hash": "86ebb3543cdad6520be9bf8863167a9a" }, "ggplot2": { "Package": "ggplot2", - "Version": "3.4.2", + "Version": "3.5.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1235,23 +1205,25 @@ "vctrs", "withr" ], - "Hash": "3a147ee02e85a8941aad9909f1b43b7b" + "Hash": "44c6a2f8202d5b7e878ea274b1092426" }, "gh": { "Package": "gh", - "Version": "1.4.0", + "Version": "1.4.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "cli", "gitcreds", + "glue", "httr2", "ini", "jsonlite", + "lifecycle", "rlang" ], - "Hash": "03533b1c875028233598f848fda44c4c" + "Hash": "fbbbc48eba7a6626a08bb365e44b563b" }, "gitcreds": { "Package": "gitcreds", @@ -1265,31 +1237,31 @@ }, "globals": { "Package": "globals", - "Version": "0.16.2", + "Version": "0.16.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "codetools" ], - "Hash": "baa9585ab4ce47a9f4618e671778cc6f" + "Hash": "2580567908cafd4f187c1e5a91e98b7f" }, "glue": { "Package": "glue", - "Version": "1.6.2", + "Version": "1.7.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "methods" ], 
- "Hash": "4f2596dfb05dac67b9dc558e5c6fba2e" + "Hash": "e0b3a53876554bd45879e596cdb10a52" }, "googledrive": { "Package": "googledrive", - "Version": "2.1.0", + "Version": "2.1.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "cli", @@ -1308,13 +1280,13 @@ "vctrs", "withr" ], - "Hash": "e88ba642951bc8d1898ba0d12581850b" + "Hash": "e99641edef03e2a5e87f0a0b1fcc97f4" }, "googlesheets4": { "Package": "googlesheets4", - "Version": "1.1.0", + "Version": "1.1.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "cellranger", @@ -1336,7 +1308,7 @@ "vctrs", "withr" ], - "Hash": "fd7b97bd862a14297b0bb7ed28a3dada" + "Hash": "d6db1667059d027da730decdc214b959" }, "gower": { "Package": "gower", @@ -1353,7 +1325,7 @@ }, "gtable": { "Package": "gtable", - "Version": "0.3.3", + "Version": "0.3.5", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1364,11 +1336,11 @@ "lifecycle", "rlang" ], - "Hash": "b44addadb528a0d227794121c00572a0" + "Hash": "e18861963cbc65a27736e02b3cd3c4a0" }, "hardhat": { "Package": "hardhat", - "Version": "1.3.0", + "Version": "1.3.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1379,19 +1351,13 @@ "tibble", "vctrs" ], - "Hash": "b56b42c50bb7c76a683e8e61f415d828" + "Hash": "921fd010cd788de75a9c71c2c3aee1f2" }, "haven": { "Package": "haven", - "Version": "2.5.2", + "Version": "2.5.4", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "haven", - "RemoteRef": "haven", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "2.5.2", "Requirements": [ "R", "cli", @@ -1406,7 +1372,7 @@ "tidyselect", "vctrs" ], - "Hash": "8b331e659e67d757db0fcc28e689c501" + "Hash": "9171f898db9d9c4c1b2c745adc2c1ef1" }, "highr": { "Package": "highr", @@ -1441,26 +1407,25 @@ }, "htmltools": { "Package": "htmltools", - "Version": "0.5.5", + "Version": 
"0.5.8.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "base64enc", "digest", - "ellipsis", "fastmap", "grDevices", "rlang", "utils" ], - "Hash": "ba0240784ad50a62165058a27459304a" + "Hash": "81d371a9cc60640e74e4ab6ac46dcedc" }, "httr": { "Package": "httr", - "Version": "1.4.6", + "Version": "1.4.7", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "R6", @@ -1469,11 +1434,11 @@ "mime", "openssl" ], - "Hash": "7e5e3cbd2a7bc07880c94e22348fb661" + "Hash": "ac107251d9d9fd72f0ca8049988f1d7f" }, "httr2": { "Package": "httr2", - "Version": "0.2.3", + "Version": "1.0.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1482,13 +1447,15 @@ "cli", "curl", "glue", + "lifecycle", "magrittr", "openssl", "rappdirs", "rlang", + "vctrs", "withr" ], - "Hash": "193bb297368afbbb42dc85784a46b36e" + "Hash": "03d741c92fda96d98c3a3f22494e3b4a" }, "ids": { "Package": "ids", @@ -1503,9 +1470,9 @@ }, "igraph": { "Package": "igraph", - "Version": "1.5.0", + "Version": "2.0.3", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "Matrix", "R", @@ -1513,37 +1480,42 @@ "cpp11", "grDevices", "graphics", + "lifecycle", "magrittr", "methods", "pkgconfig", "rlang", "stats", - "utils" + "utils", + "vctrs" ], - "Hash": "84818361421d5fc3ff0bf4e669524217" + "Hash": "c3b7d801d722e26e4cd888e042bf9af5" }, "infer": { "Package": "infer", - "Version": "1.0.4", + "Version": "1.0.7", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "broom", + "cli", "dplyr", "generics", "ggplot2", "glue", "grDevices", + "lifecycle", "magrittr", "methods", "patchwork", "purrr", "rlang", "tibble", - "tidyr" + "tidyr", + "vctrs" ], - "Hash": "b1aa2741a03a90aa9d8187997cfc55c9" + "Hash": "8d30ac9c5e21efd8575f934bdb5c3029" }, "ini": { "Package": "ini", @@ -1554,7 +1526,7 @@ }, "insight": { "Package": "insight", - "Version": "0.19.2", + "Version": "0.19.10", "Source": "Repository", 
"Repository": "RSPM", "Requirements": [ @@ -1563,7 +1535,7 @@ "stats", "utils" ], - "Hash": "e5389ad6bc8445203e2bbbe3f42fc96f" + "Hash": "c15a38c9655cba66f5f5537a14c1bef4" }, "ipred": { "Package": "ipred", @@ -1621,17 +1593,17 @@ }, "jsonlite": { "Package": "jsonlite", - "Version": "1.8.5", + "Version": "1.8.8", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "methods" ], - "Hash": "3ee4d9899e4db3e976fc82b98d24a31a" + "Hash": "e1b9c55281c5adc4dd113652d9e26768" }, "knitr": { "Package": "knitr", - "Version": "1.43", + "Version": "1.46", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1643,33 +1615,22 @@ "xfun", "yaml" ], - "Hash": "9775eb076713f627c07ce41d8199d8f6" + "Hash": "6e008ab1d696a5283c79765fa7b56b47" }, "labeling": { "Package": "labeling", - "Version": "0.4.2", + "Version": "0.4.3", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "graphics", "stats" ], - "Hash": "3d5108641f47470611a32d0bdf357a72" - }, - "later": { - "Package": "later", - "Version": "1.3.1", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "Rcpp", - "rlang" - ], - "Hash": "40401c9cf2bc2259dfe83311c9384710" + "Hash": "b64ec208ac5bc1852b285f665d6368b3" }, "lattice": { "Package": "lattice", - "Version": "0.21-8", + "Version": "0.22-6", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1680,22 +1641,17 @@ "stats", "utils" ], - "Hash": "0b8a6d63c8770f02a8b5635f3c431e6b" + "Hash": "cc5ac1ba4c238c7ca9fa6a87ca11a7e2" }, "lava": { "Package": "lava", - "Version": "1.7.2.1", + "Version": "1.8.0", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "lava", - "RemoteRef": "lava", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.7.2.1", "Requirements": [ "R", "SQUAREM", + "cli", "future.apply", "grDevices", "graphics", @@ -1706,7 +1662,7 @@ "survival", "utils" ], - "Hash": 
"bbc70840ea0f91f34dd9703efe4c96c3" + "Hash": "579303ca1e817d94cea694b319803380" }, "lhs": { "Package": "lhs", @@ -1727,7 +1683,7 @@ }, "lifecycle": { "Package": "lifecycle", - "Version": "1.0.3", + "Version": "1.0.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1736,36 +1692,30 @@ "glue", "rlang" ], - "Hash": "001cecbeac1cff9301bdc3775ee46a86" + "Hash": "b8552d117e1b808b09a832f589b79035" }, "listenv": { "Package": "listenv", - "Version": "0.9.0", + "Version": "0.9.1", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "listenv", - "RemoteRef": "listenv", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "0.9.0", "Requirements": [ "R" ], - "Hash": "4fbd3679ec8ee169ba28d4b1ea7d0e8f" + "Hash": "e2fca3e12e4db979dccc6e519b10a7ee" }, "lubridate": { "Package": "lubridate", - "Version": "1.9.2", + "Version": "1.9.3", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "generics", "methods", "timechange" ], - "Hash": "e25f18436e3efd42c7c590a1c4c15390" + "Hash": "680ad542fbcf801442c83a6ac5a2126c" }, "magrittr": { "Package": "magrittr", @@ -1790,9 +1740,9 @@ }, "mgcv": { "Package": "mgcv", - "Version": "1.8-42", + "Version": "1.9-1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "Matrix", "R", @@ -1803,7 +1753,7 @@ "stats", "utils" ], - "Hash": "3460beba7ccc8946249ba35327ba902a" + "Hash": "110ee9d83b496279960e162ac97764ce" }, "mime": { "Package": "mime", @@ -1817,7 +1767,7 @@ }, "modelbased": { "Package": "modelbased", - "Version": "0.8.6", + "Version": "0.8.7", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1832,11 +1782,11 @@ "stats", "utils" ], - "Hash": "60a0171d055dc3122d7d2b6ed31e8003" + "Hash": "857859a5dd55f53a2c6ab14fbdb6acc1" }, "modeldata": { "Package": "modeldata", - "Version": "1.1.0", + "Version": "1.3.0", "Source": "Repository", 
"Repository": "RSPM", "Requirements": [ @@ -1847,7 +1797,7 @@ "rlang", "tibble" ], - "Hash": "df65cdee10e24635c6491ed5a98a31ef" + "Hash": "6ac8ee87ffebd14b29586fce684c14cc" }, "modelenv": { "Package": "modelenv", @@ -1888,18 +1838,18 @@ }, "munsell": { "Package": "munsell", - "Version": "0.5.0", + "Version": "0.5.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "colorspace", "methods" ], - "Hash": "6dfe8bf774944bd5595785e3229d8771" + "Hash": "4fd8900853b746af55b81fda99da7695" }, "nlme": { "Package": "nlme", - "Version": "3.1-162", + "Version": "3.1-164", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1909,7 +1859,7 @@ "stats", "utils" ], - "Hash": "0984ce8da8da9ead8643c5cbbb60f83e" + "Hash": "a623a2239e642806158bc4dc3f51565d" }, "nnet": { "Package": "nnet", @@ -1935,13 +1885,13 @@ }, "openssl": { "Package": "openssl", - "Version": "2.0.6", + "Version": "2.1.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "askpass" ], - "Hash": "0f7cd2962e3044bb940cca4f4b5cecbe" + "Hash": "ea2475b073243d9d338aa8f086ce973e" }, "outbreaks": { "Package": "outbreaks", @@ -1953,39 +1903,21 @@ ], "Hash": "a62a28f56f51694490827b57ee78f970" }, - "packrat": { - "Package": "packrat", - "Version": "0.9.1", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "tools", - "utils" - ], - "Hash": "481428983c19a7c443f7ea1beff0a2de" - }, "parallelly": { "Package": "parallelly", - "Version": "1.36.0", + "Version": "1.37.1", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "parallelly", - "RemoteRef": "parallelly", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.36.0", "Requirements": [ "parallel", "tools", "utils" ], - "Hash": "bca377e1c87ec89ebed77bba00635b2e" + "Hash": "5410df8d22bd36e616f2a2343dbb328c" }, "parameters": { "Package": "parameters", - "Version": "0.21.1", + 
"Version": "0.21.6", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1998,11 +1930,11 @@ "stats", "utils" ], - "Hash": "8694e865941e3d79a29122f60a74249b" + "Hash": "1f1bf75cb49c61df8287a0fa3b68126f" }, "parsnip": { "Package": "parsnip", - "Version": "1.1.0", + "Version": "1.2.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2027,33 +1959,29 @@ "vctrs", "withr" ], - "Hash": "009152502d5125513c353612052e9d4e" + "Hash": "ace928adf616e06ece817d970faa2d03" }, "patchwork": { "Package": "patchwork", - "Version": "1.1.2", + "Version": "1.2.0", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "patchwork", - "RemoteRef": "patchwork", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.1.2", "Requirements": [ + "cli", "ggplot2", "grDevices", "graphics", "grid", "gtable", + "rlang", "stats", "utils" ], - "Hash": "63b611e9d909a9ed057639d9c3b77152" + "Hash": "9c8ab14c00ac07e9e04d1664c0b74486" }, "performance": { "Package": "performance", - "Version": "0.10.4", + "Version": "0.11.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2061,11 +1989,10 @@ "bayestestR", "datawizard", "insight", - "methods", "stats", "utils" ], - "Hash": "c7bd3750eea3b3f5b2fdd56f22af151a" + "Hash": "eb8ecde248cd610ae3097f5d00718cbd" }, "pillar": { "Package": "pillar", @@ -2115,14 +2042,17 @@ }, "prettyunits": { "Package": "prettyunits", - "Version": "1.1.1", + "Version": "1.2.0", "Source": "Repository", - "Repository": "CRAN", - "Hash": "95ef9167b75dde9d2ccc3c7528393e7e" + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "6b01fc98b1e86c4f705ce9dcfd2f57c7" }, "processx": { "Package": "processx", - "Version": "3.8.1", + "Version": "3.8.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2131,19 +2061,13 @@ "ps", "utils" ], - "Hash": "d75b4059d781336efba24021915902b4" + "Hash": 
"0c90a7d71988856bad2a2a45dd871bb9" }, "prodlim": { "Package": "prodlim", - "Version": "2023.03.31", + "Version": "2023.08.28", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "prodlim", - "RemoteRef": "prodlim", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "2023.03.31", "Requirements": [ "KernSmooth", "R", @@ -2156,47 +2080,48 @@ "stats", "survival" ], - "Hash": "3f60fadb28cfebdd20b0dd4198a38c60" + "Hash": "c73e09a2039a0f75ac0a1e5454b39993" }, "progress": { "Package": "progress", - "Version": "1.2.2", + "Version": "1.2.3", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ + "R", "R6", "crayon", "hms", "prettyunits" ], - "Hash": "14dc9f7a3c91ebb14ec5bb9208a07061" + "Hash": "f4625e061cb2865f111b47ff163a5ca6" }, "progressr": { "Package": "progressr", - "Version": "0.13.0", + "Version": "0.14.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "digest", "utils" ], - "Hash": "376a8ebcc878f9c1395e212548fc297a" + "Hash": "ac50c4ffa8f6a46580dd4d7813add3c4" }, "ps": { "Package": "ps", - "Version": "1.7.5", + "Version": "1.7.6", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "utils" ], - "Hash": "709d852d33178db54b17c722e5b1e594" + "Hash": "dd2b9319ee0656c8acf45c7f40c59de7" }, "purrr": { "Package": "purrr", - "Version": "1.0.1", + "Version": "1.0.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2207,11 +2132,21 @@ "rlang", "vctrs" ], - "Hash": "d71c815267c640f17ddbf7f16144b4bb" + "Hash": "1cba04a4e9414bdefc9dcaa99649a8dc" + }, + "quadprog": { + "Package": "quadprog", + "Version": "1.5-8", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "5f919ae5e7f83a6f91dcf2288943370d" }, "quantreg": { "Package": "quantreg", - "Version": "5.95", + "Version": "5.97", "Source": "Repository", "Repository": 
"RSPM", "Requirements": [ @@ -2225,45 +2160,22 @@ "stats", "survival" ], - "Hash": "6cdf5a3dc958f55529b0401a8e5869e3" - }, - "quarto": { - "Package": "quarto", - "Version": "1.2", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "jsonlite", - "later", - "processx", - "rmarkdown", - "rsconnect", - "rstudioapi", - "utils", - "yaml" - ], - "Hash": "298a252816cabed120391c955aced484" + "Hash": "1bbc97f7d637ab3917c514a69047b2c1" }, "ragg": { "Package": "ragg", - "Version": "1.2.5", + "Version": "1.3.0", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "ragg", - "RemoteRef": "ragg", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.2.5", "Requirements": [ "systemfonts", "textshaping" ], - "Hash": "690bc058ea2b1b8a407d3cfe3dce3ef9" + "Hash": "082e1a198e3329d571f4448ef0ede4bc" }, "ranger": { "Package": "ranger", - "Version": "0.15.1", + "Version": "0.16.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2272,7 +2184,7 @@ "Rcpp", "RcppEigen" ], - "Hash": "ec74161e5a464623b0cdfc435f9926ed" + "Hash": "d5ca3a8d00f088042ea3b638534e0f3d" }, "rappdirs": { "Package": "rappdirs", @@ -2286,15 +2198,9 @@ }, "readr": { "Package": "readr", - "Version": "2.1.4", + "Version": "2.1.5", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "readr", - "RemoteRef": "readr", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "2.1.4", "Requirements": [ "R", "R6", @@ -2311,19 +2217,13 @@ "utils", "vroom" ], - "Hash": "b5047343b3825f37ad9d3b5d89aa1078" + "Hash": "9de96463d2117f6ac49980577939dfb3" }, "readxl": { "Package": "readxl", - "Version": "1.4.2", + "Version": "1.4.3", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "readxl", - "RemoteRef": "readxl", - "RemoteRepos": 
"https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.4.2", "Requirements": [ "R", "cellranger", @@ -2332,11 +2232,11 @@ "tibble", "utils" ], - "Hash": "2e6020b1399d95f947ed867045e9ca17" + "Hash": "8cf9c239b96df1bbb133b74aef77ad0a" }, "recipes": { "Package": "recipes", - "Version": "1.0.6", + "Version": "1.0.10", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2365,14 +2265,14 @@ "vctrs", "withr" ], - "Hash": "eb53ffc9674dc9a52c3a7e22d96d3f56" + "Hash": "69783cdd607c58fffb21c5c26c6ededf" }, "rematch": { "Package": "rematch", - "Version": "1.0.1", + "Version": "2.0.0", "Source": "Repository", - "Repository": "CRAN", - "Hash": "c66b930d20bb6d858cd18e1cebcfae5c" + "Repository": "RSPM", + "Hash": "cbff1b666c6fa6d21202f07e2318d4f1" }, "rematch2": { "Package": "rematch2", @@ -2386,25 +2286,19 @@ }, "renv": { "Package": "renv", - "Version": "0.17.3", + "Version": "1.0.7", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "utils" ], - "Hash": "4543b8cd233ae25c6aba8548be9e747e" + "Hash": "397b7b2a265bc5a7a06852524dabae20" }, "reprex": { "Package": "reprex", - "Version": "2.0.2", + "Version": "2.1.0", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "reprex", - "RemoteRef": "reprex", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "2.0.2", "Requirements": [ "R", "callr", @@ -2420,22 +2314,22 @@ "utils", "withr" ], - "Hash": "d66fe009d4c20b7ab1927eb405db9ee2" + "Hash": "1425f91b4d5d9a8f25352c44a3d914ed" }, "rlang": { "Package": "rlang", - "Version": "1.1.1", + "Version": "1.1.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "utils" ], - "Hash": "a85c767b55f0bf9b7ad16c6d7baee5bb" + "Hash": "42548638fae05fd9a9b5f3f437fbbbe2" }, "rmarkdown": { "Package": "rmarkdown", - "Version": "2.22", + "Version": "2.26", "Source": "Repository", "Repository": "RSPM", "Requirements": 
[ @@ -2448,18 +2342,17 @@ "jsonlite", "knitr", "methods", - "stringr", "tinytex", "tools", "utils", "xfun", "yaml" ], - "Hash": "75a01be060d800ceb14e32c666cacac9" + "Hash": "9b148e7f95d33aac01f31282d49e4f44" }, "rpart": { "Package": "rpart", - "Version": "4.1.19", + "Version": "4.1.23", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2468,29 +2361,31 @@ "graphics", "stats" ], - "Hash": "b3c892a81783376cc2204af0f5805a80" + "Hash": "b3d390424f41d04174cccf84d49676c2" }, "rprojroot": { "Package": "rprojroot", - "Version": "2.0.3", + "Version": "2.0.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "1de7ab598047a87bba48434ba35d497d" + "Hash": "4c8415e0ec1e29f3f4f6fc108bef0144" }, "rsample": { "Package": "rsample", - "Version": "1.1.1", + "Version": "1.2.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", + "cli", "dplyr", "furrr", "generics", "glue", + "lifecycle", "methods", "pillar", "purrr", @@ -2501,44 +2396,20 @@ "tidyselect", "vctrs" ], - "Hash": "cb0c54ebc268ec382be8e4d4a8c34557" - }, - "rsconnect": { - "Package": "rsconnect", - "Version": "0.8.29", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "curl", - "digest", - "jsonlite", - "openssl", - "packrat", - "rstudioapi", - "tools", - "yaml" - ], - "Hash": "fe178fc15af80952f546aafedf655b36" + "Hash": "95e0f11d79a7494919c14aa4d8e9e177" }, "rstudioapi": { "Package": "rstudioapi", - "Version": "0.14", + "Version": "0.16.0", "Source": "Repository", "Repository": "RSPM", - "Hash": "690bd2acc42a9166ce34845884459320" + "Hash": "96710351d642b70e8f02ddeb237c46a7" }, "rvest": { "Package": "rvest", - "Version": "1.0.3", + "Version": "1.0.4", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "rvest", - "RemoteRef": "rvest", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.0.3", "Requirements": [ "R", "cli", @@ 
-2549,14 +2420,13 @@ "rlang", "selectr", "tibble", - "withr", "xml2" ], - "Hash": "a4a5ac819a467808c60e36e92ddf195e" + "Hash": "0bcf0c6f274e90ea314b812a6d19a519" }, "sass": { "Package": "sass", - "Version": "0.4.6", + "Version": "0.4.9", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2566,29 +2436,31 @@ "rappdirs", "rlang" ], - "Hash": "cc3ec7dd33982ef56570229b62d6388e" + "Hash": "d53dbfddf695303ea4ad66f86e99b95d" }, "scales": { "Package": "scales", - "Version": "1.2.1", + "Version": "1.3.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "R6", "RColorBrewer", + "cli", "farver", + "glue", "labeling", "lifecycle", "munsell", "rlang", "viridisLite" ], - "Hash": "906cb23d2f1c5680b8ce439b44c6fa63" + "Hash": "c19df082ba346b0ffa6f833e92de34d1" }, "see": { "Package": "see", - "Version": "0.8.0", + "Version": "0.8.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2606,7 +2478,7 @@ "performance", "stats" ], - "Hash": "7d400d1615814add7e3b59cfef306e03" + "Hash": "3d2fd0b72314499e6af4fd20d39309dc" }, "selectr": { "Package": "selectr", @@ -2636,28 +2508,22 @@ }, "shape": { "Package": "shape", - "Version": "1.4.6", + "Version": "1.4.6.1", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "shape", - "RemoteRef": "shape", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.4.6", "Requirements": [ "R", "grDevices", "graphics", "stats" ], - "Hash": "9067f962730f58b14d8ae54ca885509f" + "Hash": "5c47e84dc0a3ca761ae1d307889e796d" }, "slider": { "Package": "slider", - "Version": "0.3.0", + "Version": "0.3.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "cli", @@ -2665,7 +2531,7 @@ "vctrs", "warp" ], - "Hash": "c1c73df260af9e1e3692eb3b8e1ecb88" + "Hash": "a584625e2b9e4fad4be135c8ea5c99aa" }, "smoothqr": { "Package": "smoothqr", @@ -2688,7 +2554,7 @@ }, "stringi": { 
"Package": "stringi", - "Version": "1.7.12", + "Version": "1.8.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2697,11 +2563,11 @@ "tools", "utils" ], - "Hash": "ca8bd84263c77310739d2cf64d84d7c9" + "Hash": "058aebddea264f4c99401515182e656a" }, "stringr": { "Package": "stringr", - "Version": "1.5.0", + "Version": "1.5.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2714,32 +2580,13 @@ "stringi", "vctrs" ], - "Hash": "671a4d384ae9d32fc47a14e98bfa3dc8" - }, - "styler": { - "Package": "styler", - "Version": "1.10.1", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "R.cache", - "cli", - "magrittr", - "purrr", - "rlang", - "rprojroot", - "tools", - "vctrs", - "withr" - ], - "Hash": "b0911fdb2c682f526f6e9c131fd40a1f" + "Hash": "960e2ae9e09656611e0b8214ad543207" }, "survival": { "Package": "survival", - "Version": "3.5-5", + "Version": "3.6-4", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "Matrix", "R", @@ -2749,7 +2596,7 @@ "stats", "utils" ], - "Hash": "d683341b1fa2e8d817efde27d6e6d35b" + "Hash": "e6e3071f471513e4b85f98ca041303c7" }, "sys": { "Package": "sys", @@ -2760,26 +2607,26 @@ }, "systemfonts": { "Package": "systemfonts", - "Version": "1.0.4", + "Version": "1.0.6", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "cpp11" ], - "Hash": "90b28393209827327de889f49935140a" + "Hash": "6d538cff441f0f1f36db2209ac7495ac" }, "textshaping": { "Package": "textshaping", - "Version": "0.3.6", + "Version": "0.3.7", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "cpp11", "systemfonts" ], - "Hash": "1ab6223d3670fac7143202cb6a2d43d5" + "Hash": "997aac9ad649e0ef3b97f96cddd5622b" }, "tibble": { "Package": "tibble", @@ -2802,7 +2649,7 @@ }, "tidymodels": { "Package": "tidymodels", - "Version": "1.1.0", + "Version": "1.2.0", "Source": "Repository", "Repository": "RSPM", 
"Requirements": [ @@ -2829,11 +2676,11 @@ "workflowsets", "yardstick" ], - "Hash": "65f6942e7cb9396aa31daeaf0d79f70c" + "Hash": "c3296bbe8389a31fafc1ee07e69889a7" }, "tidyr": { "Package": "tidyr", - "Version": "1.3.0", + "Version": "1.3.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2852,11 +2699,11 @@ "utils", "vctrs" ], - "Hash": "e47debdc7ce599b070c8e78e8ac0cfcf" + "Hash": "915fb7ce036c22a6a33b5a8adb712eb1" }, "tidyselect": { "Package": "tidyselect", - "Version": "1.2.0", + "Version": "1.2.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2868,7 +2715,7 @@ "vctrs", "withr" ], - "Hash": "79540e5fcd9e0435af547d885f184fd5" + "Hash": "829f27b9c4919c16b593794a6344d6c0" }, "tidyverse": { "Package": "tidyverse", @@ -2918,15 +2765,9 @@ }, "timeDate": { "Package": "timeDate", - "Version": "4022.108", + "Version": "4032.109", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "timeDate", - "RemoteRef": "timeDate", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "4022.108", "Requirements": [ "R", "graphics", @@ -2934,32 +2775,32 @@ "stats", "utils" ], - "Hash": "3f7918d2b36c17ffe07cddba6458453e" + "Hash": "fa276a2ec2555d74b4eabf56fba3d209" }, "timechange": { "Package": "timechange", - "Version": "0.2.0", + "Version": "0.3.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "cpp11" ], - "Hash": "8548b44f79a35ba1791308b61e6012d7" + "Hash": "c5f3c201b931cd6474d17d8700ccb1c8" }, "tinytex": { "Package": "tinytex", - "Version": "0.45", + "Version": "0.50", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "xfun" ], - "Hash": "e4e357f28c2edff493936b6cb30c3d65" + "Hash": "be7a76845222ad20adb761f462eed3ea" }, "tsibble": { "Package": "tsibble", - "Version": "1.1.3", + "Version": "1.1.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2976,11 +2817,11 
@@ "tidyselect", "vctrs" ], - "Hash": "a9d10100310663a7583cf7e16a990e6f" + "Hash": "d5da786ac5a28f62ca2eb8255ad7b9f3" }, "tune": { "Package": "tune", - "Version": "1.1.1", + "Version": "1.2.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2988,8 +2829,10 @@ "R", "cli", "dials", + "doFuture", "dplyr", "foreach", + "future", "generics", "ggplot2", "glue", @@ -3008,7 +2851,7 @@ "workflows", "yardstick" ], - "Hash": "abf2edf028c09305eaf0159fbb27d851" + "Hash": "7fbdbcd58e7a63957b23ddb751b346af" }, "tzdb": { "Package": "tzdb", @@ -3023,7 +2866,7 @@ }, "usethis": { "Package": "usethis", - "Version": "2.2.0", + "Version": "2.2.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -3050,33 +2893,33 @@ "withr", "yaml" ], - "Hash": "a108160a30d25e9aca3444a130dd53d9" + "Hash": "d524fd42c517035027f866064417d7e6" }, "utf8": { "Package": "utf8", - "Version": "1.2.3", + "Version": "1.2.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "1fe17157424bb09c48a8b3b550c753bc" + "Hash": "62b65c52671e6665f803ff02954446e9" }, "uuid": { "Package": "uuid", - "Version": "1.1-0", + "Version": "1.2-0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "f1cb46c157d080b729159d407be83496" + "Hash": "303c19bfd970bece872f93a824e323d9" }, "vctrs": { "Package": "vctrs", - "Version": "0.6.2", + "Version": "0.6.5", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "cli", @@ -3084,7 +2927,7 @@ "lifecycle", "rlang" ], - "Hash": "a745bda7aff4734c17294bb41d4e4607" + "Hash": "c03fa420630029418f7e6da3667aac4a" }, "viridisLite": { "Package": "viridisLite", @@ -3098,15 +2941,9 @@ }, "vroom": { "Package": "vroom", - "Version": "1.6.3", + "Version": "1.6.5", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "vroom", - "RemoteRef": "vroom", - "RemoteRepos": 
"https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.6.3", "Requirements": [ "R", "bit64", @@ -3126,17 +2963,17 @@ "vctrs", "withr" ], - "Hash": "8318e64ffb3a70e652494017ec455561" + "Hash": "390f9315bc0025be03012054103d227c" }, "warp": { "Package": "warp", - "Version": "0.2.0", + "Version": "0.2.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "2982481615756e24e79fee95bdc95daa" + "Hash": "fea474d578b1cbcb696ae6ac8bdcc439" }, "whisker": { "Package": "whisker", @@ -3147,20 +2984,19 @@ }, "withr": { "Package": "withr", - "Version": "2.5.0", + "Version": "3.0.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "grDevices", - "graphics", - "stats" + "graphics" ], - "Hash": "c0e49a9760983e81e55cdd9be92e7182" + "Hash": "d31b6c62c10dcf11ec530ca6b0dd5d35" }, "workflows": { "Package": "workflows", - "Version": "1.1.3", + "Version": "1.1.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -3176,11 +3012,11 @@ "tidyselect", "vctrs" ], - "Hash": "553cd1d3d88da41e40f70b04f657a164" + "Hash": "f2c2cefdf6babfed4594b33479d19fc3" }, "workflowsets": { "Package": "workflowsets", - "Version": "1.0.1", + "Version": "1.1.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -3206,22 +3042,23 @@ "withr", "workflows" ], - "Hash": "7393bad2a9c83d549e79bd641f6bf67a" + "Hash": "ff4540bb4cccc1dd2447d58a97158820" }, "xfun": { "Package": "xfun", - "Version": "0.39", + "Version": "0.43", "Source": "Repository", "Repository": "RSPM", "Requirements": [ + "grDevices", "stats", "tools" ], - "Hash": "8f56e9acb54fb525e66464d57ab58bcb" + "Hash": "ab6371d8653ce5f2f9290f4ec7b42a8e" }, "xgboost": { "Package": "xgboost", - "Version": "1.7.5.1", + "Version": "1.7.7.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -3231,29 +3068,31 @@ "jsonlite", "methods" ], - "Hash": 
"6b8d09cf8ffc148162a4d57d01e61490" + "Hash": "6303e61eac62aef7bd2b396ef7e24386" }, "xml2": { "Package": "xml2", - "Version": "1.3.4", + "Version": "1.3.6", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", - "methods" + "cli", + "methods", + "rlang" ], - "Hash": "7dc765ac9b909487326a7d471fdd3821" + "Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61" }, "yaml": { "Package": "yaml", - "Version": "2.3.7", + "Version": "2.3.8", "Source": "Repository", "Repository": "RSPM", - "Hash": "0d0056cc5383fbc240ccd0cb584bf436" + "Hash": "29240487a071f535f5e5d5a323b7afbd" }, "yardstick": { "Package": "yardstick", - "Version": "1.2.0", + "Version": "1.3.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -3267,16 +3106,17 @@ "tibble", "tidyselect", "utils", - "vctrs" + "vctrs", + "withr" ], - "Hash": "935418860629e50d2a2c495ea0a05221" + "Hash": "9ce4117141b326c4fffc7c42e56e0f88" }, "zip": { "Package": "zip", - "Version": "2.3.0", + "Version": "2.3.1", "Source": "Repository", "Repository": "RSPM", - "Hash": "d98c94dacb7e0efcf83b0a133a705504" + "Hash": "fcc4bd8e6da2d2011eb64a5e5cc685ab" } } } diff --git a/renv/activate.R b/renv/activate.R index a8fdc32..d13f993 100644 --- a/renv/activate.R +++ b/renv/activate.R @@ -2,10 +2,28 @@ local({ # the requested version of renv - version <- "0.17.3" + version <- "1.0.7" + attr(version, "sha") <- NULL # the project directory - project <- getwd() + project <- Sys.getenv("RENV_PROJECT") + if (!nzchar(project)) + project <- getwd() + + # use start-up diagnostics if enabled + diagnostics <- Sys.getenv("RENV_STARTUP_DIAGNOSTICS", unset = "FALSE") + if (diagnostics) { + start <- Sys.time() + profile <- tempfile("renv-startup-", fileext = ".Rprof") + utils::Rprof(profile) + on.exit({ + utils::Rprof(NULL) + elapsed <- signif(difftime(Sys.time(), start, units = "auto"), digits = 2L) + writeLines(sprintf("- renv took %s to run the autoloader.", format(elapsed))) + writeLines(sprintf("- Profile: %s", 
profile)) + print(utils::summaryRprof(profile)) + }, add = TRUE) + } # figure out whether the autoloader is enabled enabled <- local({ @@ -15,6 +33,14 @@ local({ if (!is.null(override)) return(override) + # if we're being run in a context where R_LIBS is already set, + # don't load -- presumably we're being run as a sub-process and + # the parent process has already set up library paths for us + rcmd <- Sys.getenv("R_CMD", unset = NA) + rlibs <- Sys.getenv("R_LIBS", unset = NA) + if (!is.na(rlibs) && !is.na(rcmd)) + return(FALSE) + # next, check environment variables # TODO: prefer using the configuration one in the future envvars <- c( @@ -34,9 +60,22 @@ local({ }) - if (!enabled) + # bail if we're not enabled + if (!enabled) { + + # if we're not enabled, we might still need to manually load + # the user profile here + profile <- Sys.getenv("R_PROFILE_USER", unset = "~/.Rprofile") + if (file.exists(profile)) { + cfg <- Sys.getenv("RENV_CONFIG_USER_PROFILE", unset = "TRUE") + if (tolower(cfg) %in% c("true", "t", "1")) + sys.source(profile, envir = globalenv()) + } + return(FALSE) + } + # avoid recursion if (identical(getOption("renv.autoloader.running"), TRUE)) { warning("ignoring recursive attempt to run renv autoloader") @@ -60,25 +99,90 @@ local({ # load bootstrap tools `%||%` <- function(x, y) { - if (is.environment(x) || length(x)) x else y + if (is.null(x)) y else x } - `%??%` <- function(x, y) { - if (is.null(x)) y else x + catf <- function(fmt, ..., appendLF = TRUE) { + + quiet <- getOption("renv.bootstrap.quiet", default = FALSE) + if (quiet) + return(invisible()) + + msg <- sprintf(fmt, ...) + cat(msg, file = stdout(), sep = if (appendLF) "\n" else "") + + invisible(msg) + + } + + header <- function(label, + ..., + prefix = "#", + suffix = "-", + n = min(getOption("width"), 78)) + { + label <- sprintf(label, ...) 
+ n <- max(n - nchar(label) - nchar(prefix) - 2L, 8L) + if (n <= 0) + return(paste(prefix, label)) + + tail <- paste(rep.int(suffix, n), collapse = "") + paste0(prefix, " ", label, " ", tail) + + } + + heredoc <- function(text, leave = 0) { + + # remove leading, trailing whitespace + trimmed <- gsub("^\\s*\\n|\\n\\s*$", "", text) + + # split into lines + lines <- strsplit(trimmed, "\n", fixed = TRUE)[[1L]] + + # compute common indent + indent <- regexpr("[^[:space:]]", lines) + common <- min(setdiff(indent, -1L)) - leave + paste(substring(lines, common), collapse = "\n") + + } + + startswith <- function(string, prefix) { + substring(string, 1, nchar(prefix)) == prefix } bootstrap <- function(version, library) { + friendly <- renv_bootstrap_version_friendly(version) + section <- header(sprintf("Bootstrapping renv %s", friendly)) + catf(section) + # attempt to download renv - tarball <- tryCatch(renv_bootstrap_download(version), error = identity) - if (inherits(tarball, "error")) - stop("failed to download renv ", version) + catf("- Downloading renv ... ", appendLF = FALSE) + withCallingHandlers( + tarball <- renv_bootstrap_download(version), + error = function(err) { + catf("FAILED") + stop("failed to download:\n", conditionMessage(err)) + } + ) + catf("OK") + on.exit(unlink(tarball), add = TRUE) # now attempt to install - status <- tryCatch(renv_bootstrap_install(version, tarball, library), error = identity) - if (inherits(status, "error")) - stop("failed to install renv ", version) + catf("- Installing renv ... 
", appendLF = FALSE) + withCallingHandlers( + status <- renv_bootstrap_install(version, tarball, library), + error = function(err) { + catf("FAILED") + stop("failed to install:\n", conditionMessage(err)) + } + ) + catf("OK") + + # add empty line to break up bootstrapping from normal output + catf("") + return(invisible()) } renv_bootstrap_tests_running <- function() { @@ -108,13 +212,6 @@ local({ if (!inherits(repos, "error") && length(repos)) return(repos) - # if we're testing, re-use the test repositories - if (renv_bootstrap_tests_running()) { - repos <- getOption("renv.tests.repos") - if (!is.null(repos)) - return(repos) - } - # retrieve current repos repos <- getOption("repos") @@ -158,33 +255,34 @@ local({ renv_bootstrap_download <- function(version) { - # if the renv version number has 4 components, assume it must - # be retrieved via github - nv <- numeric_version(version) - components <- unclass(nv)[[1]] - - # if this appears to be a development version of 'renv', we'll - # try to restore from github - dev <- length(components) == 4L - - # begin collecting different methods for finding renv - methods <- c( - renv_bootstrap_download_tarball, - if (dev) - renv_bootstrap_download_github - else c( - renv_bootstrap_download_cran_latest, - renv_bootstrap_download_cran_archive + sha <- attr(version, "sha", exact = TRUE) + + methods <- if (!is.null(sha)) { + + # attempting to bootstrap a development version of renv + c( + function() renv_bootstrap_download_tarball(sha), + function() renv_bootstrap_download_github(sha) ) - ) + + } else { + + # attempting to bootstrap a release version of renv + c( + function() renv_bootstrap_download_tarball(version), + function() renv_bootstrap_download_cran_latest(version), + function() renv_bootstrap_download_cran_archive(version) + ) + + } for (method in methods) { - path <- tryCatch(method(version), error = identity) + path <- tryCatch(method(), error = identity) if (is.character(path) && file.exists(path)) return(path) } - 
stop("failed to download renv ", version) + stop("All download methods failed") } @@ -248,8 +346,6 @@ local({ type <- spec$type repos <- spec$repos - message("* Downloading renv ", version, " ... ", appendLF = FALSE) - baseurl <- utils::contrib.url(repos = repos, type = type) ext <- if (identical(type, "source")) ".tar.gz" @@ -266,13 +362,10 @@ local({ condition = identity ) - if (inherits(status, "condition")) { - message("FAILED") + if (inherits(status, "condition")) return(FALSE) - } # report success and return - message("OK (downloaded ", type, ")") destfile } @@ -329,8 +422,6 @@ local({ urls <- file.path(repos, "src/contrib/Archive/renv", name) destfile <- file.path(tempdir(), name) - message("* Downloading renv ", version, " ... ", appendLF = FALSE) - for (url in urls) { status <- tryCatch( @@ -338,14 +429,11 @@ local({ condition = identity ) - if (identical(status, 0L)) { - message("OK") + if (identical(status, 0L)) return(destfile) - } } - message("FAILED") return(FALSE) } @@ -368,7 +456,7 @@ local({ if (!file.exists(tarball)) { # let the user know we weren't able to honour their request - fmt <- "* RENV_BOOTSTRAP_TARBALL is set (%s) but does not exist." + fmt <- "- RENV_BOOTSTRAP_TARBALL is set (%s) but does not exist." msg <- sprintf(fmt, tarball) warning(msg) @@ -377,10 +465,7 @@ local({ } - fmt <- "* Bootstrapping with tarball at path '%s'." - msg <- sprintf(fmt, tarball) - message(msg) - + catf("- Using local tarball '%s'.", tarball) tarball } @@ -407,8 +492,6 @@ local({ on.exit(do.call(base::options, saved), add = TRUE) } - message("* Downloading renv ", version, " from GitHub ... 
", appendLF = FALSE) - url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version) name <- sprintf("renv_%s.tar.gz", version) destfile <- file.path(tempdir(), name) @@ -418,26 +501,105 @@ local({ condition = identity ) - if (!identical(status, 0L)) { - message("FAILED") + if (!identical(status, 0L)) return(FALSE) - } - message("OK") + renv_bootstrap_download_augment(destfile) + return(destfile) } + # Add Sha to DESCRIPTION. This is stop gap until #890, after which we + # can use renv::install() to fully capture metadata. + renv_bootstrap_download_augment <- function(destfile) { + sha <- renv_bootstrap_git_extract_sha1_tar(destfile) + if (is.null(sha)) { + return() + } + + # Untar + tempdir <- tempfile("renv-github-") + on.exit(unlink(tempdir, recursive = TRUE), add = TRUE) + untar(destfile, exdir = tempdir) + pkgdir <- dir(tempdir, full.names = TRUE)[[1]] + + # Modify description + desc_path <- file.path(pkgdir, "DESCRIPTION") + desc_lines <- readLines(desc_path) + remotes_fields <- c( + "RemoteType: github", + "RemoteHost: api.github.com", + "RemoteRepo: renv", + "RemoteUsername: rstudio", + "RemotePkgRef: rstudio/renv", + paste("RemoteRef: ", sha), + paste("RemoteSha: ", sha) + ) + writeLines(c(desc_lines[desc_lines != ""], remotes_fields), con = desc_path) + + # Re-tar + local({ + old <- setwd(tempdir) + on.exit(setwd(old), add = TRUE) + + tar(destfile, compression = "gzip") + }) + invisible() + } + + # Extract the commit hash from a git archive. Git archives include the SHA1 + # hash as the comment field of the tarball pax extended header + # (see https://www.kernel.org/pub/software/scm/git/docs/git-archive.html) + # For GitHub archives this should be the first header after the default one + # (512 byte) header. 
+ renv_bootstrap_git_extract_sha1_tar <- function(bundle) { + + # open the bundle for reading + # We use gzcon for everything because (from ?gzcon) + # > Reading from a connection which does not supply a 'gzip' magic + # > header is equivalent to reading from the original connection + conn <- gzcon(file(bundle, open = "rb", raw = TRUE)) + on.exit(close(conn)) + + # The default pax header is 512 bytes long and the first pax extended header + # with the comment should be 51 bytes long + # `52 comment=` (11 chars) + 40 byte SHA1 hash + len <- 0x200 + 0x33 + res <- rawToChar(readBin(conn, "raw", n = len)[0x201:len]) + + if (grepl("^52 comment=", res)) { + sub("52 comment=", "", res) + } else { + NULL + } + } + renv_bootstrap_install <- function(version, tarball, library) { # attempt to install it into project library - message("* Installing renv ", version, " ... ", appendLF = FALSE) dir.create(library, showWarnings = FALSE, recursive = TRUE) + output <- renv_bootstrap_install_impl(library, tarball) + + # check for successful install + status <- attr(output, "status") + if (is.null(status) || identical(status, 0L)) + return(status) + + # an error occurred; report it + header <- "installation of renv failed" + lines <- paste(rep.int("=", nchar(header)), collapse = "") + text <- paste(c(header, lines, output), collapse = "\n") + stop(text) + + } + + renv_bootstrap_install_impl <- function(library, tarball) { # invoke using system2 so we can capture and report output bin <- R.home("bin") exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" - r <- file.path(bin, exe) + R <- file.path(bin, exe) args <- c( "--vanilla", "CMD", "INSTALL", "--no-multiarch", @@ -445,19 +607,7 @@ local({ shQuote(path.expand(tarball)) ) - output <- system2(r, args, stdout = TRUE, stderr = TRUE) - message("Done!") - - # check for successful install - status <- attr(output, "status") - if (is.numeric(status) && !identical(status, 0L)) { - header <- "Error installing renv:" - lines <- 
paste(rep.int("=", nchar(header)), collapse = "") - text <- c(header, lines, output) - writeLines(text, con = stderr()) - } - - status + system2(R, args, stdout = TRUE, stderr = TRUE) } @@ -498,6 +648,9 @@ local({ # if the user has requested an automatic prefix, generate it auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA) + if (is.na(auto) && getRversion() >= "4.4.0") + auto <- "TRUE" + if (auto %in% c("TRUE", "True", "true", "1")) return(renv_bootstrap_platform_prefix_auto()) @@ -667,34 +820,61 @@ local({ } - renv_bootstrap_validate_version <- function(version) { + renv_bootstrap_validate_version <- function(version, description = NULL) { + + # resolve description file + # + # avoid passing lib.loc to `packageDescription()` below, since R will + # use the loaded version of the package by default anyhow. note that + # this function should only be called after 'renv' is loaded + # https://github.com/rstudio/renv/issues/1625 + description <- description %||% packageDescription("renv") - loadedversion <- utils::packageDescription("renv", fields = "Version") - if (version == loadedversion) + # check whether requested version 'version' matches loaded version of renv + sha <- attr(version, "sha", exact = TRUE) + valid <- if (!is.null(sha)) + renv_bootstrap_validate_version_dev(sha, description) + else + renv_bootstrap_validate_version_release(version, description) + + if (valid) return(TRUE) - # assume four-component versions are from GitHub; - # three-component versions are from CRAN - components <- strsplit(loadedversion, "[.-]")[[1]] - remote <- if (length(components) == 4L) - paste("rstudio/renv", loadedversion, sep = "@") + # the loaded version of renv doesn't match the requested version; + # give the user instructions on how to proceed + dev <- identical(description[["RemoteType"]], "github") + remote <- if (dev) + paste("rstudio/renv", description[["RemoteSha"]], sep = "@") else - paste("renv", loadedversion, sep = "@") + paste("renv", 
description[["Version"]], sep = "@") - fmt <- paste( - "renv %1$s was loaded from project library, but this project is configured to use renv %2$s.", - "Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.", - "Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library.", - sep = "\n" + # display both loaded version + sha if available + friendly <- renv_bootstrap_version_friendly( + version = description[["Version"]], + sha = if (dev) description[["RemoteSha"]] ) - msg <- sprintf(fmt, loadedversion, version, remote) - warning(msg, call. = FALSE) + fmt <- heredoc(" + renv %1$s was loaded from project library, but this project is configured to use renv %2$s. + - Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile. + - Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library. + ") + catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote) FALSE } + renv_bootstrap_validate_version_dev <- function(version, description) { + expected <- description[["RemoteSha"]] + is.character(expected) && startswith(expected, version) + } + + renv_bootstrap_validate_version_release <- function(version, description) { + expected <- description[["Version"]] + is.character(expected) && identical(expected, version) + } + renv_bootstrap_hash_text <- function(text) { hashfile <- tempfile("renv-hash-") @@ -718,7 +898,7 @@ local({ hooks <- getHook("renv::autoload") for (hook in hooks) if (is.function(hook)) - tryCatch(hook(), error = warning) + tryCatch(hook(), error = warnify) # load the project renv::load(project) @@ -859,6 +1039,40 @@ local({ } + renv_bootstrap_version_friendly <- function(version, shafmt = NULL, sha = NULL) { + sha <- sha %||% attr(version, "sha", exact = TRUE) + parts <- c(version, sprintf(shafmt %||% " [sha: %s]", substring(sha, 1L, 7L))) + paste(parts, collapse = "") + } + + renv_bootstrap_exec <- function(project, libpath, version) { + if (!renv_bootstrap_load(project, 
libpath, version)) + renv_bootstrap_run(version, libpath) + } + + renv_bootstrap_run <- function(version, libpath) { + + # perform bootstrap + bootstrap(version, libpath) + + # exit early if we're just testing bootstrap + if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA))) + return(TRUE) + + # try again to load + if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { + return(renv::load(project = getwd())) + } + + # failed to download or load renv; warn the user + msg <- c( + "Failed to find an renv installation: the project will not be loaded.", + "Use `renv::activate()` to re-initialize the project." + ) + + warning(paste(msg, collapse = "\n"), call. = FALSE) + + } renv_json_read <- function(file = NULL, text = NULL) { @@ -867,7 +1081,7 @@ local({ # if jsonlite is loaded, use that instead if ("jsonlite" %in% loadedNamespaces()) { - json <- catch(renv_json_read_jsonlite(file, text)) + json <- tryCatch(renv_json_read_jsonlite(file, text), error = identity) if (!inherits(json, "error")) return(json) @@ -876,7 +1090,7 @@ local({ } # otherwise, fall back to the default JSON reader - json <- catch(renv_json_read_default(file, text)) + json <- tryCatch(renv_json_read_default(file, text), error = identity) if (!inherits(json, "error")) return(json) @@ -889,14 +1103,14 @@ local({ } renv_json_read_jsonlite <- function(file = NULL, text = NULL) { - text <- paste(text %||% read(file), collapse = "\n") + text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n") jsonlite::fromJSON(txt = text, simplifyVector = FALSE) } renv_json_read_default <- function(file = NULL, text = NULL) { # find strings in the JSON - text <- paste(text %||% read(file), collapse = "\n") + text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n") pattern <- '["](?:(?:\\\\.)|(?:[^"\\\\]))*?["]' locs <- gregexpr(pattern, text, perl = TRUE)[[1]] @@ -944,14 +1158,14 @@ local({ map <- as.list(map) # remap strings in object - remapped <- 
renv_json_remap(json, map) + remapped <- renv_json_read_remap(json, map) # evaluate eval(remapped, envir = baseenv()) } - renv_json_remap <- function(json, map) { + renv_json_read_remap <- function(json, map) { # fix names if (!is.null(names(json))) { @@ -978,7 +1192,7 @@ local({ # recurse if (is.recursive(json)) { for (i in seq_along(json)) { - json[i] <- list(renv_json_remap(json[[i]], map)) + json[i] <- list(renv_json_read_remap(json[[i]], map)) } } @@ -998,35 +1212,9 @@ local({ # construct full libpath libpath <- file.path(root, prefix) - # attempt to load - if (renv_bootstrap_load(project, libpath, version)) - return(TRUE) - - # load failed; inform user we're about to bootstrap - prefix <- paste("# Bootstrapping renv", version) - postfix <- paste(rep.int("-", 77L - nchar(prefix)), collapse = "") - header <- paste(prefix, postfix) - message(header) - - # perform bootstrap - bootstrap(version, libpath) - - # exit early if we're just testing bootstrap - if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA))) - return(TRUE) - - # try again to load - if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { - message("* Successfully installed and loaded renv ", version, ".") - return(renv::load()) - } - - # failed to download or load renv; warn the user - msg <- c( - "Failed to find an renv installation: the project will not be loaded.", - "Use `renv::activate()` to re-initialize the project." - ) + # run bootstrap code + renv_bootstrap_exec(project, libpath, version) - warning(paste(msg, collapse = "\n"), call. 
= FALSE) + invisible() }) From 20a4a82a859e2e6de44c5727dda5b71a174270d1 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 2 May 2024 17:48:41 -0700 Subject: [PATCH 4/8] refactor+doc: fix a few issues * add a README file * fix broken formatting in packages.bib * get missing data for sliding-forecasters.qmd online instead of local files --- README.md | 22 + _freeze/archive/execute-results/html.json | 4 +- .../archive/figure-html/unnamed-chunk-8-1.svg | 1465 +++ _freeze/index/execute-results/html.json | 4 +- .../execute-results/html.json | 4 +- .../figure-html/plot-ar-asof-1.svg | 3491 +++--- .../figure-html/plot-arx-1.svg | 3459 +++--- .../figure-html/plot-can-fc-boost-1.svg | 9589 +++++++++-------- .../figure-html/plot-can-fc-lr-1.svg | 9506 ++++++++-------- packages.bib | 3 +- sliding-forecasters.qmd | 147 +- 11 files changed, 15130 insertions(+), 12564 deletions(-) create mode 100644 README.md create mode 100644 _freeze/archive/figure-html/unnamed-chunk-8-1.svg diff --git a/README.md b/README.md new file mode 100644 index 0000000..b1d1e8e --- /dev/null +++ b/README.md @@ -0,0 +1,22 @@ +# Delphi Tooling Book + +The book is a collection of articles and tutorials on how to use the Delphi tooling effectively. + +## Compiling the book + +The book is written with [Quarto](https://quarto.org/docs/guide/) (which can be installed [here](https://quarto.org/docs/get-started/)). To compile the book, run the following commands: + +```sh +# Install the R dependencies +R -e 'install.packages(c("pak", "rspm", "renv"))' +R -e 'renv::restore()' + +# Compile the book and preview it +quarto preview +``` + +We use Quarto's freeze feature to re-render only the qmd files that have changed. 
To force a re-render of a page, run this command: + +```sh +quarto render +``` diff --git a/_freeze/archive/execute-results/html.json b/_freeze/archive/execute-results/html.json index 29d86a0..8c57390 100644 --- a/_freeze/archive/execute-results/html.json +++ b/_freeze/archive/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "4d2b35f719103c28883f0aeeaf2e3e2b", + "hash": "4abbf9d8187ca890d0c13fd0656d50e9", "result": { - "markdown": "# Work with archive objects and data revisions\n\nIn addition to the `epi_df` data structure, which we have been working with all\nalong in these vignettes, the `epiprocess` package has a companion structure\ncalled `epi_archive`. In comparison to an `epi_df` object, which can be seen as\nstoring a single snapshot of a data set with the most up-to-date signal values\nas of some given time, an `epi_archive` object stores the full version history\nof a data set. Many signals of interest for epidemiological tracking are subject\nto revision (some more than others), and paying attention to data revisions can\nbe important for all sorts of downstream data analysis and modeling tasks.\n\nThis chapter walks through working with `epi_archive` objects and demonstrates\nsome of their key functionality. We'll work with a signal on the percentage of\ndoctor's visits with CLI (COVID-like illness) computed from medical insurance\nclaims, available through the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). This\nsignal is subject to very heavy and regular revision; you can read more about it\non its [API documentation\npage](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). 
We'll use the offline version stored in `{epidatasets}`.\n\n\n\n\n\n\n## Getting data into `epi_archive` format\n\nAn `epi_archive` object\ncan be constructed from a data frame, data table, or tibble, provided that it\nhas (at least) the following columns:\n\n* `geo_value`: the geographic value associated with each row of measurements.\n* `time_value`: the time value associated with each row of measurements.\n* `version`: the time value specifying the version for each row of measurements.\n For example, if in a given row the `version` is January 15, 2022 and\n `time_value` is January 14, 2022, then this row contains the measurements of\n the data for January 14, 2022 that were available one day later.\n\nAs we can see from the above, the data frame returned by\n`epidatr::covidcast()` has the columns required for the `epi_archive`\nformat, so we use\n`as_epi_archive()` to cast it into `epi_archive` format.[^1]\n\n[^1]: For a discussion of the removal of\nredundant version updates in `as_epi_archive` using compactify, please refer\nto the [compactify vignette](https://cmu-delphi.github.io/epiprocess/articles/compactify.html).\n\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-2_39c5cbdbb56253b327ea66e6ab4e8220'}\n\n```{.r .cell-code}\nx <- archive_cases_dv_subset_dt %>%\n select(geo_value, time_value, version, percent_cli) %>%\n as_epi_archive(compactify = TRUE)\n\nclass(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_archive\" \"R6\"\n```\n:::\n\n```{.r .cell-code}\nprint(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_archive` object, with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> ----------\n#> * min time value = 2020-06-01\n#> * max time value = 2021-11-30\n#> * first version with update = 2020-06-02\n#> * last version with update = 2021-12-01\n#> * No clobberable versions\n#> * versions end = 2021-12-01\n#> ----------\n#> Data archive (stored in DT field): 119316 x 4\n#> 
Columns in DT: geo_value, time_value, version, percent_cli\n#> ----------\n#> Public R6 methods: initialize, print, as_of, fill_through_version, \n#> truncate_versions_after, merge, group_by, slide, clone\n```\n:::\n:::\n\n\nAn `epi_archive` is special kind of class called an R6 class. Its primary field\nis a data table `DT`, which is of class `data.table` (from the `data.table`\npackage), and has columns `geo_value`, `time_value`, `version`, as well as any\nnumber of additional columns.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-3_99d23f4e3321a367498344c4b6282562'}\n\n```{.r .cell-code}\nclass(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"data.table\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nhead(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> geo_value time_value version percent_cli\n#> 1: ca 2020-06-01 2020-06-02 NA\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> 6: ca 2020-06-01 2020-06-11 2.197207\n```\n:::\n:::\n\n\nThe variables `geo_value`, `time_value`, `version` serve as **key variables**\nfor the data table, as well as any other specified in the metadata (described\nbelow). There can only be a single row per unique combination of key variables,\nand therefore the key variables are critical for figuring out how to generate a\nsnapshot of data from the archive, as of a given version (also described below).\n \n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-4_8b3712fe1140194d1eb702521cf15238'}\n\n```{.r .cell-code}\nkey(x$DT)\n```\n\n::: {.cell-output .cell-output-error}\n```\n#> Error in key(x$DT): could not find function \"key\"\n```\n:::\n:::\n\n \nIn general, the last version of each observation is carried forward (LOCF) to\nfill in data between recorded versions. 
**A word of caution:** R6 objects,\nunlike most other objects in R, have reference semantics. An important\nconsequence of this is that objects are not copied when modified.\n \n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-5_86ba88485d14cbc7ddf328b75c606b4d'}\n\n```{.r .cell-code}\noriginal_value <- x$DT$percent_cli[1]\ny <- x # This DOES NOT make a copy of x\ny$DT$percent_cli[1] <- 0\nhead(y$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> geo_value time_value version percent_cli\n#> 1: ca 2020-06-01 2020-06-02 0.000000\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> 6: ca 2020-06-01 2020-06-11 2.197207\n```\n:::\n\n```{.r .cell-code}\nhead(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> geo_value time_value version percent_cli\n#> 1: ca 2020-06-01 2020-06-02 0.000000\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> 6: ca 2020-06-01 2020-06-11 2.197207\n```\n:::\n\n```{.r .cell-code}\nx$DT$percent_cli[1] <- original_value\n```\n:::\n\n\nTo make a copy, we can use the `clone()` method for an R6 class, as in `y <-\nx$clone()`. You can read more about reference semantics in Hadley Wickham's\n[Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book.\n\n## Some details on metadata\n\nThe following pieces of metadata are included as fields in an `epi_archive`\nobject: \n\n* `geo_type`: the type for the geo values.\n* `time_type`: the type for the time values.\n* `additional_metadata`: list of additional metadata for the data archive.\n\nMetadata for an `epi_archive` object `x` can be accessed (and altered) directly,\nas in `x$geo_type` or `x$time_type`, etc. 
Just like `as_epi_df()`, the function\n`as_epi_archive()` attempts to guess metadata fields when an `epi_archive`\nobject is instantiated, if they are not explicitly specified in the function\ncall (as it did in the case above).\n\n## Producing snapshots in `epi_df` form\n\nA key method of an `epi_archive` class is `as_of()`, which generates a snapshot\nof the archive in `epi_df` format. This represents the most up-to-date values of\nthe signal variables as of a given version. This can be accessed via `x$as_of()`\nfor an `epi_archive` object `x`, but the package also provides a simple wrapper \nfunction `epix_as_of()` since this is likely a more familiar interface for users\nnot familiar with R6 (or object-oriented programming).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-6_0150335f0031c0eb619a4ab5e1b2b899'}\n\n```{.r .cell-code}\nx_snapshot <- epix_as_of(x, max_version = as.Date(\"2021-06-01\"))\nclass(x_snapshot)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_df\" \"tbl_df\" \"tbl\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nx_snapshot\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 1,460 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2021-06-01\n#> \n#> # A tibble: 1,460 × 3\n#> geo_value time_value percent_cli\n#> * \n#> 1 ca 2020-06-01 2.75\n#> 2 ca 2020-06-02 2.57\n#> 3 ca 2020-06-03 2.48\n#> 4 ca 2020-06-04 2.41\n#> 5 ca 2020-06-05 2.57\n#> 6 ca 2020-06-06 2.63\n#> # ℹ 1,454 more rows\n```\n:::\n\n```{.r .cell-code}\nmax(x_snapshot$time_value)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"2021-05-31\"\n```\n:::\n\n```{.r .cell-code}\nattributes(x_snapshot)$metadata$as_of\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"2021-06-01\"\n```\n:::\n:::\n\n\nWe can see that the max time value in the `epi_df` object `x_snapshot` that was \ngenerated from the archive is May 29, 2021, even though the specified version\ndate 
was June 1, 2021. From this we can infer that the doctor's visits signal\nwas 2 days latent on June 1. Also, we can see that the metadata in the `epi_df`\nobject has the version date recorded in the `as_of` field.\n\nBy default, using the maximum of the `version` column in the underlying data table in an\n`epi_archive` object itself generates a snapshot of the latest values of signal\nvariables in the entire archive. The `epix_as_of()` function issues a warning in\nthis case, since updates to the current version may still come in at a later \npoint in time, due to various reasons, such as synchronization issues.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-7_d5f40a899e63f06b4b5411a752857f2a'}\n\n```{.r .cell-code}\nx_latest <- epix_as_of(x, max_version = max(x$DT$version))\n```\n:::\n\n\nBelow, we pull several snapshots from the archive, spaced one month apart. We\noverlay the corresponding signal curves as colored lines, with the version dates\nmarked by dotted vertical lines, and draw the latest curve in black (from the \nlatest snapshot `x_latest` that the archive can provide).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-8_204613e6af4268fe83f46e1635e0ba9e'}\n\n```{.r .cell-code}\nself_max <- max(x$DT$version)\nversions <- seq(as.Date(\"2020-06-01\"), self_max - 1, by = \"1 month\")\nsnapshots <- map(\n versions,\n function(v) {\n epix_as_of(x, max_version = v) %>% mutate(version = v)\n }\n) %>%\n list_rbind() %>%\n bind_rows(x_latest %>% mutate(version = self_max)) %>%\n mutate(latest = version == self_max)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-9_abb01f2c77a56adc9b3456f605179f88'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n snapshots %>% filter(!latest),\n aes(x = time_value, y = percent_cli)\n) +\n geom_line(aes(color = factor(version)), na.rm = TRUE) +\n geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +\n 
facet_wrap(~geo_value, scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n scale_color_viridis_d(option = \"A\", end = .9) +\n labs(x = \"Date\", y = \"% of doctor's visits with CLI\") +\n theme(legend.position = \"none\") +\n geom_line(\n data = snapshots %>% filter(latest),\n aes(x = time_value, y = percent_cli),\n inherit.aes = FALSE, color = \"black\", na.rm = TRUE\n )\n```\n\n::: {.cell-output-display}\n![](archive_files/figure-html/unnamed-chunk-9-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nWe can see some interesting and highly nontrivial revision behavior: at some\npoints in time the provisional data snapshots grossly underestimate the latest\ncurve (look in particular at Florida close to the end of 2021), and at others\nthey overestimate it (both states towards the beginning of 2021), though not \nquite as dramatically. Modeling the revision process, which is often called\n*backfill modeling*, is an important statistical problem in it of itself.\n\n\n## Merging `epi_archive` objects \n\nNow we demonstrate how to merge two `epi_archive` objects together, e.g., so\nthat grabbing data from multiple sources as of a particular version can be\nperformed with a single `as_of` call. The `epi_archive` class provides a method\n`merge()` precisely for this purpose. The wrapper function is called\n`epix_merge()`; this wrapper avoids mutating its inputs, while `x$merge` will\nmutate `x`. 
Below we merge the working `epi_archive` of versioned percentage CLI\nfrom outpatient visits to another one of versioned COVID-19 case reporting data,\nwhich we fetch the from the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html/), on the\nrate scale (counts per 100,000 people in the population).\n\nWhen merging archives, unless the archives have identical data release patterns,\n`NA`s can be introduced in the non-key variables for a few reasons:\n- to represent the \"value\" of an observation before its initial release (when we\n need to pair it with additional observations from the other archive that have\n been released)\n- to represent the \"value\" of an observation that has no recorded versions at\n all (in the same sort of situation)\n- if requested via `sync = \"na\"`, to represent potential update data that we do\n not yet have access to (e.g., due to encountering issues while attempting to\n download the currently available version data for one of the archives, but not\n the other).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-10_f17506759a99a453bf60434e742adfa5'}\n\n```{.r .cell-code}\n# This code is for illustration and doesn't run.\n# The result is saved/loaded in the (hidden) next chunk from `{epidatasets}`\ny <- pub_covidcast(\n source = \"jhu-csse\",\n signals = \"confirmed_7dav_incidence_prop\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20200601, 20211201),\n geo_values = \"ca,fl,ny,tx\",\n issues = epirange(20200601, 20211201)\n) %>%\n select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%\n as_epi_archive(compactify = TRUE)\n\nx$merge(y, sync = \"locf\", compactify = FALSE)\nprint(x)\nhead(x$DT)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-11_02fcba02d29e69cfaaf1db0683d5eb4c'}\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_archive` object, with metadata:\n#> * geo_type = state\n#> 
* time_type = day\n#> ----------\n#> * min time value = 2020-06-01\n#> * max time value = 2021-11-30\n#> * first version with update = 2020-06-02\n#> * last version with update = 2021-12-01\n#> * No clobberable versions\n#> * versions end = 2021-12-01\n#> ----------\n#> Data archive (stored in DT field): 129638 x 5\n#> Columns in DT: geo_value, time_value, version, percent_cli and 1 more columns\n#> ----------\n#> Public R6 methods: initialize, print, as_of, fill_through_version, \n#> truncate_versions_after, merge, group_by, slide, clone\n```\n:::\n\n::: {.cell-output .cell-output-stdout}\n```\n#> geo_value time_value version percent_cli case_rate_7d_av\n#> 1: ca 2020-06-01 2020-06-02 NA 6.628329\n#> 2: ca 2020-06-01 2020-06-06 2.140116 6.628329\n#> 3: ca 2020-06-01 2020-06-07 2.140116 6.628329\n#> 4: ca 2020-06-01 2020-06-08 2.140379 6.628329\n#> 5: ca 2020-06-01 2020-06-09 2.114430 6.628329\n#> 6: ca 2020-06-01 2020-06-10 2.133677 6.628329\n```\n:::\n:::\n\n\nImportantly, see that `x$merge` mutated `x` to hold the result of the merge. We\ncould also have used `xy = epix_merge(x, y)` to avoid mutating `x`. See the\ndocumentation for either for more detailed descriptions of what mutation,\npointer aliasing, and pointer reseating is possible.\n\n## Sliding version-aware computations\n \n::: {.callout-note}\nTODO: need a simple example here.\n:::\n", + "markdown": "# Work with archive objects and data revisions\n\nIn addition to the `epi_df` data structure, which we have been working with all\nalong in these vignettes, the `epiprocess` package has a companion structure\ncalled `epi_archive`. In comparison to an `epi_df` object, which can be seen as\nstoring a single snapshot of a data set with the most up-to-date signal values\nas of some given time, an `epi_archive` object stores the full version history\nof a data set. 
Many signals of interest for epidemiological tracking are subject\nto revision (some more than others), and paying attention to data revisions can\nbe important for all sorts of downstream data analysis and modeling tasks.\n\nThis chapter walks through working with `epi_archive` objects and demonstrates\nsome of their key functionality. We'll work with a signal on the percentage of\ndoctor's visits with CLI (COVID-like illness) computed from medical insurance\nclaims, available through the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). This\nsignal is subject to very heavy and regular revision; you can read more about it\non its [API documentation\npage](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). We'll use the offline version stored in `{epidatasets}`.\n\n\n\n\n\n\n## Getting data into `epi_archive` format\n\nAn `epi_archive` object can be constructed from a data frame, data table, or\ntibble, provided that it has (at least) the following columns:\n\n* `geo_value`: the geographic value associated with each row of measurements.\n* `time_value`: the time value associated with each row of measurements.\n* `version`: the time value specifying the version for each row of measurements.\n For example, if in a given row the `version` is January 15, 2022 and\n `time_value` is January 14, 2022, then this row contains the measurements of\n the data for January 14, 2022 that were available one day later.\n\nAs we can see from the above, the data frame returned by\n`epidatr::covidcast()` has the columns required for the `epi_archive`\nformat, so we use\n`as_epi_archive()` to cast it into `epi_archive` format.[^1]\n\n[^1]: For a discussion of the removal of\nredundant version updates in `as_epi_archive` using compactify, please refer\nto the [compactify vignette](https://cmu-delphi.github.io/epiprocess/articles/compactify.html).\n\n\n\n::: {.cell layout-align=\"center\" 
hash='archive_cache/html/unnamed-chunk-2_39c5cbdbb56253b327ea66e6ab4e8220'}\n\n```{.r .cell-code}\nx <- archive_cases_dv_subset_dt %>%\n select(geo_value, time_value, version, percent_cli) %>%\n as_epi_archive(compactify = TRUE)\n\nclass(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_archive\"\n```\n:::\n\n```{.r .cell-code}\nprint(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> → An `epi_archive` object, with metadata:\n#> ℹ Min/max time values: 2020-06-01 / 2021-11-30\n#> ℹ First/last version with update: 2020-06-02 / 2021-12-01\n#> ℹ Versions end: 2021-12-01\n#> ℹ A preview of the table (119316 rows x 4 columns):\n#> Key: \n#> geo_value time_value version percent_cli\n#> \n#> 1: ca 2020-06-01 2020-06-02 NA\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> --- \n#> 119312: tx 2021-11-26 2021-11-29 1.858596\n#> 119313: tx 2021-11-27 2021-11-28 NA\n#> 119314: tx 2021-11-28 2021-11-29 NA\n#> 119315: tx 2021-11-29 2021-11-30 NA\n#> 119316: tx 2021-11-30 2021-12-01 NA\n```\n:::\n:::\n\n\nAn `epi_archive` is an S3 class. 
Its primary field is a data table `DT`, which\nis of class `data.table` (from the `data.table` package), and has columns\n`geo_value`, `time_value`, `version`, as well as any number of additional\ncolumns.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-3_99d23f4e3321a367498344c4b6282562'}\n\n```{.r .cell-code}\nclass(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"data.table\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nhead(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Key: \n#> geo_value time_value version percent_cli\n#> \n#> 1: ca 2020-06-01 2020-06-02 NA\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> 6: ca 2020-06-01 2020-06-11 2.197207\n```\n:::\n:::\n\n\nThe variables `geo_value`, `time_value`, `version` serve as **key variables**\nfor the data table, as well as any other specified in the metadata (described\nbelow). 
There can only be a single row per unique combination of key variables,\nand therefore the key variables are critical for figuring out how to generate a\nsnapshot of data from the archive, as of a given version (also described below).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-4_8b3712fe1140194d1eb702521cf15238'}\n\n```{.r .cell-code}\nkey(x$DT)\n```\n\n::: {.cell-output .cell-output-error}\n```\n#> Error in key(x$DT): could not find function \"key\"\n```\n:::\n:::\n\n\nIn general, the last version of each observation is carried forward (LOCF) to\nfill in data between recorded versions.\n\n## Some details on metadata\n\nThe following pieces of metadata are included as fields in an `epi_archive`\nobject:\n\n* `geo_type`: the type for the geo values.\n* `time_type`: the type for the time values.\n* `additional_metadata`: list of additional metadata for the data archive.\n\nMetadata for an `epi_archive` object `x` can be accessed (and altered) directly,\nas in `x$geo_type` or `x$time_type`, etc. Just like `as_epi_df()`, the function\n`as_epi_archive()` attempts to guess metadata fields when an `epi_archive`\nobject is instantiated, if they are not explicitly specified in the function\ncall (as it did in the case above).\n\n## Producing snapshots in `epi_df` form\n\nA key method of an `epi_archive` class is `as_of()`, which generates a snapshot\nof the archive in `epi_df` format. This represents the most up-to-date values of\nthe signal variables as of a given version. 
This can be accessed via\n`epix_as_of()`.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-5_d82372bbd8143517377c9afe9103cce8'}\n\n```{.r .cell-code}\nx_snapshot <- epix_as_of(x, max_version = as.Date(\"2021-06-01\"))\nclass(x_snapshot)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_df\" \"tbl_df\" \"tbl\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nx_snapshot\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 1,460 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2021-06-01\n#> \n#> # A tibble: 1,460 × 3\n#> geo_value time_value percent_cli\n#> * \n#> 1 ca 2020-06-01 2.75\n#> 2 ca 2020-06-02 2.57\n#> 3 ca 2020-06-03 2.48\n#> 4 ca 2020-06-04 2.41\n#> 5 ca 2020-06-05 2.57\n#> 6 ca 2020-06-06 2.63\n#> # ℹ 1,454 more rows\n```\n:::\n\n```{.r .cell-code}\nmax(x_snapshot$time_value)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"2021-05-31\"\n```\n:::\n\n```{.r .cell-code}\nattributes(x_snapshot)$metadata$as_of\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"2021-06-01\"\n```\n:::\n:::\n\n\nWe can see that the max time value in the `epi_df` object `x_snapshot` that was\ngenerated from the archive is May 31, 2021, even though the specified version\ndate was June 1, 2021. From this we can infer that the doctor's visits signal\nwas 1 day latent on June 1. Also, we can see that the metadata in the `epi_df`\nobject has the version date recorded in the `as_of` field.\n\nBy default, using the maximum of the `version` column in the underlying data table in an\n`epi_archive` object itself generates a snapshot of the latest values of signal\nvariables in the entire archive.
The `epix_as_of()` function issues a warning in\nthis case, since updates to the current version may still come in at a later\npoint in time, due to various reasons, such as synchronization issues.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-6_ae9ceb907b24026cb708ea184ff52cc4'}\n\n```{.r .cell-code}\nx_latest <- epix_as_of(x, max_version = max(x$DT$version))\n```\n:::\n\n\nBelow, we pull several snapshots from the archive, spaced one month apart. We\noverlay the corresponding signal curves as colored lines, with the version dates\nmarked by dotted vertical lines, and draw the latest curve in black (from the\nlatest snapshot `x_latest` that the archive can provide).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-7_7b41b3bd4e404515018a4c8d6293057d'}\n\n```{.r .cell-code}\nself_max <- max(x$DT$version)\nversions <- seq(as.Date(\"2020-06-01\"), self_max - 1, by = \"1 month\")\nsnapshots <- map(\n versions,\n function(v) {\n epix_as_of(x, max_version = v) %>% mutate(version = v)\n }\n) %>%\n list_rbind() %>%\n bind_rows(x_latest %>% mutate(version = self_max)) %>%\n mutate(latest = version == self_max)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-8_8625834090bf668df1c1c2bcae527e81'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n snapshots %>% filter(!latest),\n aes(x = time_value, y = percent_cli)\n) +\n geom_line(aes(color = factor(version)), na.rm = TRUE) +\n geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n scale_color_viridis_d(option = \"A\", end = .9) +\n labs(x = \"Date\", y = \"% of doctor's visits with CLI\") +\n theme(legend.position = \"none\") +\n geom_line(\n data = snapshots %>% filter(latest),\n aes(x = time_value, y = percent_cli),\n inherit.aes = FALSE, color = \"black\", na.rm = TRUE\n 
)\n```\n\n::: {.cell-output-display}\n![](archive_files/figure-html/unnamed-chunk-8-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nWe can see some interesting and highly nontrivial revision behavior: at some\npoints in time the provisional data snapshots grossly underestimate the latest\ncurve (look in particular at Florida close to the end of 2021), and at others\nthey overestimate it (both states towards the beginning of 2021), though not\nquite as dramatically. Modeling the revision process, which is often called\n*backfill modeling*, is an important statistical problem in and of itself.\n\n\n## Merging `epi_archive` objects\n\nNow we demonstrate how to merge two `epi_archive` objects together, e.g., so\nthat grabbing data from multiple sources as of a particular version can be\nperformed with a single `as_of` call. The `epiprocess` package provides\n`epix_merge()` for this purpose. Below we merge the working `epi_archive` of\nversioned percentage CLI from outpatient visits to another one of versioned\nCOVID-19 case reporting data, which we fetch from the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html), on the\nrate scale (counts per 100,000 people in the population).\n\nWhen merging archives, unless the archives have identical data release patterns,\n`NA`s can be introduced in the non-key variables for a few reasons:\n- to represent the \"value\" of an observation before its initial release (when we\n need to pair it with additional observations from the other archive that have\n been released)\n- to represent the \"value\" of an observation that has no recorded versions at\n all (in the same sort of situation)\n- if requested via `sync = \"na\"`, to represent potential update data that we do\n not yet have access to (e.g., due to encountering issues while attempting to\n download the currently available version data for one of the archives, but not\n the other).\n\n\n::: {.cell layout-align=\"center\"
hash='archive_cache/html/unnamed-chunk-9_dd8b9566b93ee92760a9756afba10db6'}\n\n```{.r .cell-code}\n# This code is for illustration and doesn't run.\n# The result is saved/loaded in the (hidden) next chunk from `{epidatasets}`\ny <- pub_covidcast(\n source = \"jhu-csse\",\n signals = \"confirmed_7dav_incidence_prop\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20200601, 20211201),\n geo_values = \"ca,fl,ny,tx\",\n issues = epirange(20200601, 20211201)\n) %>%\n select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%\n as_epi_archive(compactify = TRUE)\n\nx <- epix_merge(x, y, sync = \"locf\", compactify = FALSE)\nprint(x)\nhead(x$DT)\n```\n:::\n\n\n## Sliding version-aware computations\n\n::: {.callout-note}\nTODO: need a simple example here.\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/archive/figure-html/unnamed-chunk-8-1.svg b/_freeze/archive/figure-html/unnamed-chunk-8-1.svg new file mode 100644 index 0000000..9c9e89c --- /dev/null +++ b/_freeze/archive/figure-html/unnamed-chunk-8-1.svgdiff --git a/_freeze/index/execute-results/html.json b/_freeze/index/execute-results/html.json index 4cc6c85..fa6d283 100644 --- a/_freeze/index/execute-results/html.json +++ b/_freeze/index/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "581a82e7d09ad8a4454b6d07d65c6e0c", + "hash": "56d86a42ec6ac3c6af6193a965b2076c", "result": { - "markdown": "---\ntoc-depth: 2\nnocite: |\n @*\n---\n\n\n# Preface {.unnumbered}\n\n\n::: {.cell}\n\n:::\n\n\n::: {.callout-caution}\nThis book is still under construction and may not yet be fully self-contained or reproducible. But it hopefully will be!\n:::\n\nThis book describes some of the functionality of the\n`{epiprocess}` and `{epipredict}` R packages, with an eye toward creating various types of signal processing and forecast creation for epidemiological data. 
The goal is to be able to load, inspect, process, and forecast\n --- using simple baselines to more elaborate customizations. \n\n## Installation {#sec-installation}\n\n\n\nThe following commands install the latest versions of the packages we use in this book:\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-2_ef21555fe232decfa5c5e6ce81cfc532'}\n\n```{.r .cell-code}\n# install.packages(\"pak\")\n\n# Install our packages from GitHub:\npak::pkg_install(\"cmu-delphi/epidatr\")\npak::pkg_install(\"cmu-delphi/epiprocess\")\npak::pkg_install(\"cmu-delphi/epipredict\")\npak::pkg_install(\"cmu-delphi/epidatasets\")\n# Other model-fitting packages we use in this book (via epipredict):\npak::pkg_install(\"poissonreg\")\npak::pkg_install(\"ranger\")\npak::pkg_install(\"xgboost\")\n# Other data processing, model evaluation, example data, and other packages we\n# use in this book:\npak::pkg_install(\"RcppRoll\")\npak::pkg_install(\"tidyverse\")\npak::pkg_install(\"tidymodels\")\npak::pkg_install(\"broom\")\npak::pkg_install(\"performance\")\npak::pkg_install(\"modeldata\")\npak::pkg_install(\"see\")\npak::pkg_install(\"sessioninfo\")\n```\n:::\n\n\nMuch of the data used for illustration can be loaded directly from [Delphi's Epidata API](https://cmu-delphi.github.io/delphi-epidata/) which is built and maintained by the Carnegie Mellon University [Delphi research group](https://delphi.cmu.edu/). We have tried to provide most of the data used in these examples in a separate package, `{epidatasets}`, but it can also be accessed using `{epidatr}`, an R interface to the API and the successor to [`{covidcast}`](https://cmu-delphi.github.io/covidcast/covidcastR/). These are also available from GitHub:\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-3_6a7154f9225b9bd3d6ffa321b2da25af'}\n\n```{.r .cell-code}\npak::pkg_install(\"cmu-delphi/epidatasets\")\npak::pkg_install(\"cmu-delphi/epidatr\")\n```\n:::\n\n\n\n
Encountering installation issues? Click here to show some potential solutions. \n\n### Linux installation issues: compilation errors or slowness\n\nIf you are using Linux and encounter any compilation errors above, or if\ncompilation is taking very long, you might try using the RStudio (now called\nPosit) Package Manager to install binaries. You can try running this command\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-4_1c0021dec1f71a9fdbdd1a577b67f72a'}\n\n```{.r .cell-code}\noptions(\n repos = c(\n # contains binaries for Linux:\n RSPM = \"https://packagemanager.rstudio.com/all/latest\",\n # backup CRAN mirror of your choice:\n CRAN = \"https://cran.rstudio.com/\"\n )\n)\n```\n:::\n\n\n### Reproducibility\n\nThe above commands will give you the current versions of the packages used in\nthis book. If you're having trouble reproducing some of the results, it may be\ndue to package updates that took place after the book was last updated. To match\nthe versions we used to generate this book, you can use the steps below.\n\n#### First: set up and store a GitHub PAT\n\nIf you don't already have a GitHub PAT, you can use the following helper functions to create one:\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-5_8e6231bc239928b163de90b8ac90ad95'}\n\n```{.r .cell-code}\n# Run this once:\ninstall.packages(\"usethis\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Installing usethis [2.2.2] ...\n#> \tOK [linked cache in 0.21 milliseconds]\n#> * Installed 1 package in 1.2 seconds.\n```\n:::\n\n```{.r .cell-code}\nusethis::create_github_token(\n scopes = \"public_repo\",\n description = \"For public repo access\"\n)\n```\n:::\n\nThis will open a web browser window allowing you to describe and customize\nsettings of the PAT. Scroll to the bottom and click \"Generate\ntoken\". 
You'll see a screen that has `ghp_` with a green background; you can click the two-squares (\"copy\") icon to copy this `ghp_......` string to the clipboard.\n\n#### Either A: Download and use the `renv.lock`\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-6_298fc2b073cbedc2c6a201948f33aed0'}\n\n```{.r .cell-code}\n# Run this once:\ninstall.packages(c(\"renv\", \"gitcreds\"))\ndownload.file(\"https://raw.githubusercontent.com/cmu-delphi/delphi-tooling-book/main/renv.lock\", \"delphi-tooling-book.renv.lock\")\n\n# Run this in a fresh session each time you'd like to use this set of versions.\n# Warning: don't save your GitHub PAT in a file you might share with others;\n# look into `gitcreds::gitcreds_set()` or `usethis::edit_r_environ()` instead.\nSys.setenv(\"GITHUB_PAT\" = \"ghp_............\")\nrenv::use(lockfile = \"delphi-tooling-book.renv.lock\")\n# If you get 401 errors, you may need to regenerate your GitHub PAT or check if\n# `gitcreds::gitcreds_get()` is detecting an old PAT you have saved somewhere.\n```\n:::\n\n\n#### Or B: Download the book and use its `.Rprofile`\n\n1. Download the book [here](https://github.com/cmu-delphi/delphi-tooling-book/archive/refs/heads/main.zip) and unzip it.\n2. One-time setup: launch R inside the delphi-tooling-book directory (to use its\n `.Rprofile` file) and run\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-7_5f193d921c8c0c99eb8e67696fb70a8b'}\n\n```{.r .cell-code}\n# Warning: don't save your GitHub PAT in a file you might share with others;\n# look into `gitcreds::gitcreds_set()` or `usethis::edit_r_environ()` instead.\nSys.setenv(\"GITHUB_PAT\" = \"ghp_............\")\nrenv::restore() # downloads the appropriate package versions\n```\n:::\n\n\n3. To use this set of versions: launch R inside the delphi-tooling-book directory.\n\n### Other issues\n\nPlease let us know! 
You can file an issue with the book [here](https://github.com/cmu-delphi/delphi-tooling-book/issues), or with one of the individual packages at their own issue pages: [epidatr](https://github.com/cmu-delphi/epidatr/issues), [epiprocess](https://github.com/cmu-delphi/epiprocess/issues), [epipredict](https://github.com/cmu-delphi/epipredict/issues).\n\n
\n\n\n## Documentation\n\nYou can view the complete documentation for these packages at \n\n* ,\n* ,\n* ,\n* .\n\n## Attribution\n\nThis document contains a number of datasets that are a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). These data are licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.\n\n[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): \n These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. 
\n\n\n\n## Quick-start example\n\nThese packages come with some built-in historical data for illustration, but\nup-to-date versions could be downloaded with the\n[`{epidatr}`](https://cmu-delphi.github.io/epidatr) or \n[`{covidcast}`](https://cmu-delphi.github.io/covidcast/covidcastR/index.html) \npackages and processed using\n[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/).[^index1]\n\n[^index1]: COVIDcast data and other epidemiological signals for non-Covid related illnesses are available with [`{epidatr}`](https://cmu-delphi.github.io/epidatr), which interfaces directly to Delphi's [Epidata API](https://cmu-delphi.github.io/delphi-epidata/).\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/epidf_48a293db285163cbde5b55b5d6115276'}\n\n```{.r .cell-code}\nlibrary(epipredict)\njhu <- case_death_rate_subset\njhu\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 20,496 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-31 12:08:25\n#> \n#> # A tibble: 20,496 × 4\n#> geo_value time_value case_rate death_rate\n#> * \n#> 1 ak 2020-12-31 35.9 0.158\n#> 2 al 2020-12-31 65.1 0.438\n#> 3 ar 2020-12-31 66.0 1.27 \n#> 4 as 2020-12-31 0 0 \n#> 5 az 2020-12-31 76.8 1.10 \n#> 6 ca 2020-12-31 96.0 0.751\n#> # ℹ 20,490 more rows\n```\n:::\n:::\n\n\nTo create and train a simple auto-regressive forecaster to predict the death rate two weeks into the future using past (lagged) deaths and cases, we could use the following function.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/make-forecasts_1d4f30fdea1cd893cb39553fa0f1d21c'}\n\n```{.r .cell-code}\ntwo_week_ahead <- arx_forecaster(\n jhu,\n outcome = \"death_rate\",\n predictors = c(\"case_rate\", \"death_rate\"),\n args_list = arx_args_list(\n lags = list(case_rate = c(0, 1, 2, 3, 7, 14), death_rate = c(0, 7, 14)),\n ahead = 14\n )\n)\n```\n:::\n\n\nIn this case, we have used a number of different lags for the case rate, while only 
using 3 weekly lags for the death rate (as predictors). The result is both a fitted model object which could be used any time in the future to create different forecasts, as well as a set of predicted values (and prediction intervals) for each location 14 days after the last available time value in the data.\n\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/print-model_f5f6b4212b46903845381e0a40889efc'}\n\n```{.r .cell-code}\ntwo_week_ahead$epi_workflow\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ══ Epi Workflow [trained] ═══════════════════════════════════════════════════\n#> Preprocessor: Recipe\n#> Model: linear_reg()\n#> Postprocessor: Frosting\n#> \n#> ── Preprocessor ─────────────────────────────────────────────────────────────\n#> 6 Recipe Steps\n#> \n#> \n#> ── Model ────────────────────────────────────────────────────────────────────\n#> \n#> Call:\n#> stats::lm(formula = ..y ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) lag_0_case_rate lag_1_case_rate lag_2_case_rate \n#> -0.0073358 0.0030365 0.0012467 0.0009536 \n#> lag_3_case_rate lag_7_case_rate lag_14_case_rate lag_0_death_rate \n#> 0.0011425 0.0012481 0.0003041 0.1351769 \n#> lag_7_death_rate lag_14_death_rate \n#> 0.1471127 0.1062473 \n#> \n#> ── Postprocessor ────────────────────────────────────────────────────────────\n#> 5 Frosting Layers\n```\n:::\n:::\n\n\nThe fitted model here involved preprocessing the data to appropriately generate lagged predictors, estimating a linear model with `stats::lm()` and then postprocessing the results to be meaningful for epidemiological tasks. 
We can also examine the predictions.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/show-preds_4bf0ca6ef427c01aa0a0686ab430f93d'}\n\n```{.r .cell-code}\ntwo_week_ahead$predictions\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 56 × 5\n#> geo_value .pred .pred_distn forecast_date target_date\n#> \n#> 1 ak 0.449 quantiles(0.45)[2] 2021-12-31 2022-01-14 \n#> 2 al 0.574 quantiles(0.57)[2] 2021-12-31 2022-01-14 \n#> 3 ar 0.673 quantiles(0.67)[2] 2021-12-31 2022-01-14 \n#> 4 as 0 quantiles(0.12)[2] 2021-12-31 2022-01-14 \n#> 5 az 0.679 quantiles(0.68)[2] 2021-12-31 2022-01-14 \n#> 6 ca 0.575 quantiles(0.57)[2] 2021-12-31 2022-01-14 \n#> # ℹ 50 more rows\n```\n:::\n:::\n\n\nThe results above show a distributional forecast produced using data through the end of 2021 for the 14th of January 2022. A prediction for the death rate per 100K inhabitants is available for every state (`geo_value`) along with a 90% predictive interval. The figure below\ndisplays the forecast for a small handful of states. The vertical black line is the forecast date. 
The forecast doesn't appear to be particularly good, but our choices above were intended to be illustrative of the functionality rather than optimized for accuracy.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-8_1204ca19d449319e90b2fd3763e63dfd'}\n\n```{.r .cell-code code-fold=\"true\"}\nsamp_geos <- c(\"ca\", \"co\", \"ny\", \"pa\")\n\nhist <- jhu %>%\n filter(\n geo_value %in% samp_geos,\n time_value >= max(time_value) - 90L\n )\n\npreds <- two_week_ahead$predictions %>%\n filter(geo_value %in% samp_geos) %>%\n pivot_quantiles_wider(.pred_distn)\n\nggplot(hist, aes(color = geo_value)) +\n geom_line(aes(time_value, death_rate)) +\n theme_bw() +\n geom_errorbar(data = preds, aes(x = target_date, ymin = `0.05`, ymax = `0.95`)) +\n geom_point(data = preds, aes(target_date, .pred)) +\n geom_vline(data = preds, aes(xintercept = forecast_date)) +\n scale_colour_viridis_d(name = \"\") +\n scale_x_date(date_labels = \"%b %Y\") +\n theme(legend.position = \"bottom\") +\n labs(x = \"\", y = \"Incident deaths per 100K\\n inhabitants\")\n```\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-8-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\n## Contents\n\nThe remainder of this book examines this software in more detail, illustrating some of the flexibility that is available.\n\n---\n\n
Session Information. \n\nSee also @sec-installation.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-9_c866f0d1d0a1809a33be44cd8b8eec3f'}\n\n```{.r .cell-code}\nsessioninfo::session_info()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ─ Session info ────────────────────────────────────────────────────────────\n#> setting value\n#> version R version 4.1.3 (2022-03-10)\n#> os Fedora Linux 36 (Workstation Edition)\n#> system x86_64, linux-gnu\n#> ui X11\n#> language (EN)\n#> collate en_US.UTF-8\n#> ctype en_US.UTF-8\n#> tz America/Los_Angeles\n#> date 2023-12-15\n#> pandoc 2.14.0.3 @ /usr/bin/ (via rmarkdown)\n#> \n#> ─ Packages ────────────────────────────────────────────────────────────────\n#> ! package * version date (UTC) lib source\n#> P anytime 0.3.9 2020-08-27 [?] RSPM (R 4.1.3)\n#> P askpass 1.1 2019-01-13 [?] CRAN (R 4.0.5)\n#> P backports 1.4.1 2021-12-13 [?] CRAN (R 4.0.5)\n#> P cachem 1.0.8 2023-05-01 [?] RSPM (R 4.1.3)\n#> P checkmate 2.2.0 2023-04-27 [?] RSPM (R 4.1.3)\n#> P class 7.3-22 2023-05-03 [?] CRAN (R 4.1.3)\n#> P cli 3.6.1 2023-03-23 [?] RSPM (R 4.1.3)\n#> P codetools 0.2-19 2023-02-01 [?] RSPM (R 4.1.3)\n#> P colorspace 2.1-0 2023-01-23 [?] RSPM (R 4.1.3)\n#> P crayon 1.5.2 2022-09-29 [?] RSPM\n#> P data.table 1.14.8 2023-02-17 [?] RSPM (R 4.1.3)\n#> P digest 0.6.31 2022-12-11 [?] RSPM (R 4.1.3)\n#> P distributional 0.3.2 2023-03-22 [?] RSPM (R 4.1.3)\n#> P dplyr * 1.1.2 2023-04-20 [?] RSPM (R 4.1.3)\n#> P ellipsis 0.3.2 2021-04-29 [?] CRAN (R 4.0.5)\n#> P epidatasets * 0.0.1 2023-06-20 [?] Github (cmu-delphi/epidatasets@cc8f2a0)\n#> P epidatr * 1.0.0.9000 2023-12-15 [?] Github (cmu-delphi/epidatr@6e9f899)\n#> P epipredict * 0.0.6 2023-11-08 [?] Github (cmu-delphi/epipredict@378577a)\n#> P epiprocess * 0.7.0.9999 2023-12-15 [?] Github (cmu-delphi/epiprocess@b444a3c)\n#> P evaluate 0.21 2023-05-05 [?] RSPM (R 4.1.3)\n#> P fansi 1.0.4 2023-01-22 [?] RSPM (R 4.1.3)\n#> P farver 2.1.1 2022-07-06 [?] 
RSPM (R 4.1.3)\n#> P fastmap 1.1.1 2023-02-24 [?] RSPM (R 4.1.3)\n#> P forcats * 1.0.0 2023-01-29 [?] RSPM\n#> P fs 1.6.2 2023-04-25 [?] RSPM (R 4.1.3)\n#> P future 1.32.0 2023-03-07 [?] RSPM\n#> P future.apply 1.11.0 2023-05-21 [?] RSPM\n#> P generics 0.1.3 2022-07-05 [?] RSPM (R 4.1.3)\n#> P ggplot2 * 3.4.2 2023-04-03 [?] RSPM (R 4.1.3)\n#> P globals 0.16.2 2022-11-21 [?] RSPM (R 4.1.3)\n#> P glue 1.6.2 2022-02-24 [?] CRAN (R 4.0.5)\n#> P gower 1.0.1 2022-12-22 [?] RSPM\n#> P gtable 0.3.3 2023-03-21 [?] RSPM (R 4.1.3)\n#> P hardhat 1.3.0 2023-03-30 [?] RSPM (R 4.1.3)\n#> P hms 1.1.3 2023-03-21 [?] RSPM\n#> P htmltools 0.5.5 2023-03-23 [?] RSPM (R 4.1.3)\n#> P httr 1.4.6 2023-05-08 [?] CRAN (R 4.1.3)\n#> P ipred 0.9-14 2023-03-09 [?] RSPM\n#> P jsonlite 1.8.5 2023-06-05 [?] RSPM (R 4.1.3)\n#> P knitr 1.43 2023-05-25 [?] RSPM (R 4.1.3)\n#> P labeling 0.4.2 2020-10-20 [?] CRAN (R 4.0.5)\n#> P lattice 0.21-8 2023-04-05 [?] RSPM (R 4.1.3)\n#> P lava 1.7.2.1 2023-02-27 [?] RSPM\n#> P lifecycle 1.0.3 2022-10-07 [?] RSPM (R 4.1.3)\n#> P listenv 0.9.0 2022-12-16 [?] RSPM\n#> P lubridate * 1.9.2 2023-02-10 [?] CRAN (R 4.1.3)\n#> P magrittr 2.0.3 2022-03-30 [?] CRAN (R 4.0.5)\n#> P MASS 7.3-60 2023-05-04 [?] RSPM (R 4.1.3)\n#> P Matrix 1.5-4 2023-04-04 [?] CRAN (R 4.1.3)\n#> P MatrixModels 0.5-1 2022-09-11 [?] RSPM (R 4.1.3)\n#> P MMWRweek 0.1.3 2020-04-22 [?] RSPM (R 4.1.3)\n#> P munsell 0.5.0 2018-06-12 [?] CRAN (R 4.0.5)\n#> P nnet 7.3-19 2023-05-03 [?] RSPM (R 4.1.3)\n#> P openssl 2.0.6 2023-03-09 [?] RSPM (R 4.1.3)\n#> P parallelly 1.36.0 2023-05-26 [?] RSPM\n#> P parsnip * 1.1.0 2023-04-12 [?] RSPM (R 4.1.3)\n#> P pillar 1.9.0 2023-03-22 [?] RSPM (R 4.1.3)\n#> P pkgconfig 2.0.3 2019-09-22 [?] CRAN (R 4.1.3)\n#> P prodlim 2023.03.31 2023-04-02 [?] RSPM\n#> P purrr * 1.0.1 2023-01-10 [?] RSPM (R 4.1.3)\n#> P quantreg 5.95 2023-04-08 [?] RSPM (R 4.1.3)\n#> P R.cache 0.16.0 2022-07-21 [?] RSPM (R 4.1.3)\n#> P R.methodsS3 1.8.2 2022-06-13 [?] 
RSPM (R 4.1.3)\n#> P R.oo 1.25.0 2022-06-12 [?] RSPM (R 4.1.3)\n#> P R.utils 2.12.2 2022-11-11 [?] RSPM (R 4.1.3)\n#> P R6 2.5.1 2021-08-19 [?] CRAN (R 4.0.5)\n#> P Rcpp 1.0.10 2023-01-22 [?] RSPM (R 4.1.3)\n#> P readr * 2.1.4 2023-02-10 [?] RSPM\n#> P recipes 1.0.6 2023-04-25 [?] RSPM (R 4.1.3)\n#> P renv 0.17.3 2023-04-06 [?] RSPM (R 4.1.3)\n#> P rlang 1.1.1 2023-04-28 [?] RSPM (R 4.1.3)\n#> P rmarkdown 2.22 2023-06-01 [?] RSPM (R 4.1.3)\n#> P rpart 4.1.19 2022-10-21 [?] RSPM (R 4.1.3)\n#> P rstudioapi 0.14 2022-08-22 [?] RSPM (R 4.1.3)\n#> P scales 1.2.1 2022-08-20 [?] RSPM (R 4.1.3)\n#> P sessioninfo 1.2.2 2021-12-06 [?] CRAN (R 4.1.3)\n#> P smoothqr 0.1.1 2023-06-20 [?] Github (dajmcdon/smoothqr@3def5f0)\n#> P SparseM 1.81 2021-02-18 [?] RSPM (R 4.1.3)\n#> P stringi 1.7.12 2023-01-11 [?] RSPM (R 4.1.3)\n#> P stringr * 1.5.0 2022-12-02 [?] RSPM (R 4.1.3)\n#> P styler 1.10.1 2023-06-05 [?] RSPM (R 4.1.3)\n#> P survival 3.5-5 2023-03-12 [?] RSPM (R 4.1.3)\n#> P tibble * 3.2.1 2023-03-20 [?] RSPM (R 4.1.3)\n#> P tidyr * 1.3.0 2023-01-24 [?] RSPM (R 4.1.3)\n#> P tidyselect 1.2.0 2022-10-10 [?] RSPM (R 4.1.3)\n#> P tidyverse * 2.0.0 2023-02-22 [?] RSPM\n#> P timechange 0.2.0 2023-01-11 [?] CRAN (R 4.1.3)\n#> P timeDate 4022.108 2023-01-07 [?] RSPM\n#> P tsibble 1.1.3 2022-10-09 [?] RSPM (R 4.1.3)\n#> P tzdb 0.4.0 2023-05-12 [?] RSPM (R 4.1.3)\n#> P usethis 2.2.2 2023-07-06 [?] RSPM (R 4.1.3)\n#> P utf8 1.2.3 2023-01-31 [?] RSPM (R 4.1.3)\n#> P vctrs 0.6.2 2023-04-19 [?] CRAN (R 4.1.3)\n#> P viridisLite 0.4.2 2023-05-02 [?] RSPM (R 4.1.3)\n#> P withr 2.5.0 2022-03-03 [?] CRAN (R 4.0.5)\n#> P workflows 1.1.3 2023-02-22 [?] RSPM (R 4.1.3)\n#> P xfun 0.39 2023-04-20 [?] RSPM (R 4.1.3)\n#> P xml2 1.3.4 2023-04-27 [?] CRAN (R 4.1.3)\n#> P yaml 2.3.7 2023-01-23 [?] 
RSPM (R 4.1.3)\n#> \n#> [1] /home/fullname/.cache/R/renv/library/delphi-tooling-book-1266ecb6/R-4.1/x86_64-redhat-linux-gnu\n#> [2] /home/fullname/.cache/R/renv/sandbox/R-4.1/x86_64-redhat-linux-gnu/60c4e220\n#> \n#> P ── Loaded and on-disk path mismatch.\n#> \n#> ───────────────────────────────────────────────────────────────────────────\n```\n:::\n:::\n\n\n
\n\n\n\n", + "markdown": "---\ntoc-depth: 2\nnocite: |\n @*\n---\n\n\n# Preface {.unnumbered}\n\n\n::: {.cell}\n\n:::\n\n\n::: {.callout-caution}\nThis book is still under construction and may not yet be fully self-contained or reproducible. But it hopefully will be!\n:::\n\nThis book describes some of the functionality of the\n`{epiprocess}` and `{epipredict}` R packages, with an eye toward creating various types of signal processing and forecast creation for epidemiological data. The goal is to be able to load, inspect, process, and forecast\n --- using simple baselines to more elaborate customizations. \n\n## Installation {#sec-installation}\n\n\n\nThe following commands install the latest versions of the packages we use in this book:\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-2_ef21555fe232decfa5c5e6ce81cfc532'}\n\n```{.r .cell-code}\n# install.packages(\"pak\")\n\n# Install our packages from GitHub:\npak::pkg_install(\"cmu-delphi/epidatr\")\npak::pkg_install(\"cmu-delphi/epiprocess\")\npak::pkg_install(\"cmu-delphi/epipredict\")\npak::pkg_install(\"cmu-delphi/epidatasets\")\n# Other model-fitting packages we use in this book (via epipredict):\npak::pkg_install(\"poissonreg\")\npak::pkg_install(\"ranger\")\npak::pkg_install(\"xgboost\")\n# Other data processing, model evaluation, example data, and other packages we\n# use in this book:\npak::pkg_install(\"RcppRoll\")\npak::pkg_install(\"tidyverse\")\npak::pkg_install(\"tidymodels\")\npak::pkg_install(\"broom\")\npak::pkg_install(\"performance\")\npak::pkg_install(\"modeldata\")\npak::pkg_install(\"see\")\npak::pkg_install(\"sessioninfo\")\n```\n:::\n\n\nMuch of the data used for illustration can be loaded directly from [Delphi's Epidata API](https://cmu-delphi.github.io/delphi-epidata/) which is built and maintained by the Carnegie Mellon University [Delphi research group](https://delphi.cmu.edu/). 
We have tried to provide most of the data used in these examples in a separate package, `{epidatasets}`, but it can also be accessed using `{epidatr}`, an R interface to the API and the successor to [`{covidcast}`](https://cmu-delphi.github.io/covidcast/covidcastR/). These are also available from GitHub:\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-3_6a7154f9225b9bd3d6ffa321b2da25af'}\n\n```{.r .cell-code}\npak::pkg_install(\"cmu-delphi/epidatasets\")\npak::pkg_install(\"cmu-delphi/epidatr\")\n```\n:::\n\n\n\n
Encountering installation issues? Click here to show some potential solutions. \n\n### Linux installation issues: compilation errors or slowness\n\nIf you are using Linux and encounter any compilation errors above, or if\ncompilation is taking very long, you might try using the RStudio (now called\nPosit) Package Manager to install binaries. You can try running this command\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-4_1c0021dec1f71a9fdbdd1a577b67f72a'}\n\n```{.r .cell-code}\noptions(\n repos = c(\n # contains binaries for Linux:\n RSPM = \"https://packagemanager.rstudio.com/all/latest\",\n # backup CRAN mirror of your choice:\n CRAN = \"https://cran.rstudio.com/\"\n )\n)\n```\n:::\n\n\n### Reproducibility\n\nThe above commands will give you the current versions of the packages used in\nthis book. If you're having trouble reproducing some of the results, it may be\ndue to package updates that took place after the book was last updated. To match\nthe versions we used to generate this book, you can use the steps below.\n\n#### First: set up and store a GitHub PAT\n\nIf you don't already have a GitHub PAT, you can use the following helper functions to create one:\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-5_8e6231bc239928b163de90b8ac90ad95'}\n\n```{.r .cell-code}\n# Run this once:\ninstall.packages(\"usethis\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> The following package(s) will be installed:\n#> - usethis [2.2.3]\n#> These packages will be installed into \"~/.cache/R/renv/library/delphi-tooling-book-a509243d/R-4.3/x86_64-pc-linux-gnu\".\n#> \n#> # Installing packages -------------------------------------------------------\n#> - Installing usethis ... 
OK [linked from cache]\n#> Successfully installed 1 package in 8.9 milliseconds.\n```\n:::\n\n```{.r .cell-code}\nusethis::create_github_token(\n scopes = \"public_repo\",\n description = \"For public repo access\"\n)\n```\n:::\n\nThis will open a web browser window allowing you to describe and customize\nsettings of the PAT. Scroll to the bottom and click \"Generate\ntoken\". You'll see a screen that has `ghp_` with a green background; you can click the two-squares (\"copy\") icon to copy this `ghp_......` string to the clipboard.\n\n#### Either A: Download and use the `renv.lock`\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-6_298fc2b073cbedc2c6a201948f33aed0'}\n\n```{.r .cell-code}\n# Run this once:\ninstall.packages(c(\"renv\", \"gitcreds\"))\ndownload.file(\"https://raw.githubusercontent.com/cmu-delphi/delphi-tooling-book/main/renv.lock\", \"delphi-tooling-book.renv.lock\")\n\n# Run this in a fresh session each time you'd like to use this set of versions.\n# Warning: don't save your GitHub PAT in a file you might share with others;\n# look into `gitcreds::gitcreds_set()` or `usethis::edit_r_environ()` instead.\nSys.setenv(\"GITHUB_PAT\" = \"ghp_............\")\nrenv::use(lockfile = \"delphi-tooling-book.renv.lock\")\n# If you get 401 errors, you may need to regenerate your GitHub PAT or check if\n# `gitcreds::gitcreds_get()` is detecting an old PAT you have saved somewhere.\n```\n:::\n\n\n#### Or B: Download the book and use its `.Rprofile`\n\n1. Download the book [here](https://github.com/cmu-delphi/delphi-tooling-book/archive/refs/heads/main.zip) and unzip it.\n2. 
One-time setup: launch R inside the delphi-tooling-book directory (to use its\n `.Rprofile` file) and run\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-7_5f193d921c8c0c99eb8e67696fb70a8b'}\n\n```{.r .cell-code}\n# Warning: don't save your GitHub PAT in a file you might share with others;\n# look into `gitcreds::gitcreds_set()` or `usethis::edit_r_environ()` instead.\nSys.setenv(\"GITHUB_PAT\" = \"ghp_............\")\nrenv::restore() # downloads the appropriate package versions\n```\n:::\n\n\n3. To use this set of versions: launch R inside the delphi-tooling-book directory.\n\n### Other issues\n\nPlease let us know! You can file an issue with the book [here](https://github.com/cmu-delphi/delphi-tooling-book/issues), or with one of the individual packages at their own issue pages: [epidatr](https://github.com/cmu-delphi/epidatr/issues), [epiprocess](https://github.com/cmu-delphi/epiprocess/issues), [epipredict](https://github.com/cmu-delphi/epipredict/issues).\n\n
\n\n\n## Documentation\n\nYou can view the complete documentation for these packages at\n\n* [`{epipredict}`](https://cmu-delphi.github.io/epipredict),\n* [`{epiprocess}`](https://cmu-delphi.github.io/epiprocess),\n* [`{epidatasets}`](https://cmu-delphi.github.io/epidatasets),\n* [`{epidatr}`](https://cmu-delphi.github.io/epidatr).\n\n## Attribution\n\nThis document contains a number of datasets that are a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). These data are licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.\n\n[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): \n These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. 
\n\n\n\n## Quick-start example\n\nThese packages come with some built-in historical data for illustration, but\nup-to-date versions could be downloaded with the\n[`{epidatr}`](https://cmu-delphi.github.io/epidatr) or \n[`{covidcast}`](https://cmu-delphi.github.io/covidcast/covidcastR/index.html) \npackages and processed using\n[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/).[^index1]\n\n[^index1]: COVIDcast data and other epidemiological signals for non-Covid related illnesses are available with [`{epidatr}`](https://cmu-delphi.github.io/epidatr), which interfaces directly to Delphi's [Epidata API](https://cmu-delphi.github.io/delphi-epidata/).\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/epidf_48a293db285163cbde5b55b5d6115276'}\n\n```{.r .cell-code}\nlibrary(epipredict)\njhu <- case_death_rate_subset\njhu\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 20,496 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-31 12:08:25.791826\n#> \n#> # A tibble: 20,496 × 4\n#> geo_value time_value case_rate death_rate\n#> * \n#> 1 ak 2020-12-31 35.9 0.158\n#> 2 al 2020-12-31 65.1 0.438\n#> 3 ar 2020-12-31 66.0 1.27 \n#> 4 as 2020-12-31 0 0 \n#> 5 az 2020-12-31 76.8 1.10 \n#> 6 ca 2020-12-31 96.0 0.751\n#> # ℹ 20,490 more rows\n```\n:::\n:::\n\n\nTo create and train a simple auto-regressive forecaster to predict the death rate two weeks into the future using past (lagged) deaths and cases, we could use the following function.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/make-forecasts_1d4f30fdea1cd893cb39553fa0f1d21c'}\n\n```{.r .cell-code}\ntwo_week_ahead <- arx_forecaster(\n jhu,\n outcome = \"death_rate\",\n predictors = c(\"case_rate\", \"death_rate\"),\n args_list = arx_args_list(\n lags = list(case_rate = c(0, 1, 2, 3, 7, 14), death_rate = c(0, 7, 14)),\n ahead = 14\n )\n)\n```\n:::\n\n\nIn this case, we have used a number of different lags for the case rate, while 
only using 3 weekly lags for the death rate (as predictors). The result is both a fitted model object which could be used any time in the future to create different forecasts, as well as a set of predicted values (and prediction intervals) for each location 14 days after the last available time value in the data.\n\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/print-model_f5f6b4212b46903845381e0a40889efc'}\n\n```{.r .cell-code}\ntwo_week_ahead$epi_workflow\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> \n#> Call:\n#> stats::lm(formula = ..y ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) lag_0_case_rate lag_1_case_rate lag_2_case_rate \n#> -0.0073358 0.0030365 0.0012467 0.0009536 \n#> lag_3_case_rate lag_7_case_rate lag_14_case_rate lag_0_death_rate \n#> 0.0011425 0.0012481 0.0003041 0.1351769 \n#> lag_7_death_rate lag_14_death_rate \n#> 0.1471127 0.1062473\n```\n:::\n:::\n\n\nThe fitted model here involved preprocessing the data to appropriately generate lagged predictors, estimating a linear model with `stats::lm()` and then postprocessing the results to be meaningful for epidemiological tasks. We can also examine the predictions.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/show-preds_4bf0ca6ef427c01aa0a0686ab430f93d'}\n\n```{.r .cell-code}\ntwo_week_ahead$predictions\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 56 × 5\n#> geo_value .pred .pred_distn forecast_date target_date\n#> \n#> 1 ak 0.449 quantiles(0.45)[2] 2021-12-31 2022-01-14 \n#> 2 al 0.574 quantiles(0.57)[2] 2021-12-31 2022-01-14 \n#> 3 ar 0.673 quantiles(0.67)[2] 2021-12-31 2022-01-14 \n#> 4 as 0 quantiles(0.12)[2] 2021-12-31 2022-01-14 \n#> 5 az 0.679 quantiles(0.68)[2] 2021-12-31 2022-01-14 \n#> 6 ca 0.575 quantiles(0.57)[2] 2021-12-31 2022-01-14 \n#> # ℹ 50 more rows\n```\n:::\n:::\n\n\nThe results above show a distributional forecast produced using data through the end of 2021 for the 14th of January 2022. 
A prediction for the death rate per 100K inhabitants is available for every state (`geo_value`) along with a 90% predictive interval. The figure below\ndisplays the forecast for a small handful of states. The vertical black line is the forecast date. The forecast doesn't appear to be particularly good, but our choices above were intended to be illustrative of the functionality rather than optimized for accuracy.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-8_49e3f47328fdf1fc4b3b412580923383'}\n\n```{.r .cell-code code-fold=\"true\"}\nsamp_geos <- c(\"ca\", \"co\", \"ny\", \"pa\")\n\nhist <- jhu %>%\n filter(\n geo_value %in% samp_geos,\n time_value >= max(time_value) - 90L\n )\npreds <- two_week_ahead$predictions %>%\n filter(geo_value %in% samp_geos) %>%\n pivot_quantiles_wider(.pred_distn)\n\nggplot(hist, aes(color = geo_value)) +\n geom_line(aes(time_value, death_rate)) +\n theme_bw() +\n geom_errorbar(data = preds, aes(x = target_date, ymin = `0.05`, ymax = `0.95`)) +\n geom_point(data = preds, aes(target_date, .pred)) +\n geom_vline(data = preds, aes(xintercept = forecast_date)) +\n scale_colour_viridis_d(name = \"\") +\n scale_x_date(date_labels = \"%b %Y\") +\n theme(legend.position = \"bottom\") +\n labs(x = \"\", y = \"Incident deaths per 100K\\n inhabitants\")\n```\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-8-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\n## Contents\n\nThe remainder of this book examines this software in more detail, illustrating some of the flexibility that is available.\n\n---\n\n
Session Information. \n\nSee also @sec-installation.\n\n\n::: {.cell layout-align=\"center\" hash='index_cache/html/unnamed-chunk-9_c866f0d1d0a1809a33be44cd8b8eec3f'}\n\n```{.r .cell-code}\nsessioninfo::session_info()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> ─ Session info ────────────────────────────────────────────────────────────\n#> setting value\n#> version R version 4.3.3 (2024-02-29)\n#> os Ubuntu 20.04.6 LTS\n#> system x86_64, linux-gnu\n#> ui X11\n#> language (EN)\n#> collate en_US.UTF-8\n#> ctype en_US.UTF-8\n#> tz America/Los_Angeles\n#> date 2024-05-01\n#> pandoc 2.5 @ /usr/bin/ (via rmarkdown)\n#> \n#> ─ Packages ────────────────────────────────────────────────────────────────\n#> ! package * version date (UTC) lib source\n#> P anytime 0.3.9 2020-08-27 [?] RSPM (R 4.3.0)\n#> askpass 1.2.0 2023-09-03 [1] RSPM\n#> backports 1.4.1 2021-12-13 [1] RSPM\n#> cachem 1.0.8 2023-05-01 [1] RSPM\n#> P checkmate 2.3.1 2023-12-04 [?] RSPM (R 4.3.0)\n#> P class 7.3-22 2023-05-03 [?] CRAN (R 4.3.1)\n#> cli 3.6.2 2023-12-11 [1] RSPM\n#> P codetools 0.2-20 2024-03-31 [?] RSPM (R 4.3.0)\n#> colorspace 2.1-0 2023-01-23 [1] RSPM\n#> crayon 1.5.2 2022-09-29 [1] RSPM\n#> data.table 1.15.4 2024-03-30 [1] RSPM\n#> digest 0.6.35 2024-03-11 [1] RSPM\n#> P distributional 0.4.0 2024-02-07 [?] RSPM\n#> dplyr * 1.1.4 2023-11-17 [1] RSPM\n#> ellipsis 0.3.2 2021-04-29 [1] RSPM\n#> P epidatasets * 0.0.1 2024-05-01 [?] Github (cmu-delphi/epidatasets@ca86f03)\n#> P epidatr * 1.1.5 2024-04-03 [?] Github (cmu-delphi/epidatr@626c30b)\n#> P epipredict * 0.0.14 2024-05-01 [?] Github (cmu-delphi/epipredict@5e50a5a)\n#> P epiprocess * 0.7.7 2024-05-01 [?] Github (cmu-delphi/epiprocess@e61e11a)\n#> evaluate 0.23 2023-11-01 [1] RSPM\n#> fansi 1.0.6 2023-12-08 [1] RSPM\n#> farver 2.1.1 2022-07-06 [1] RSPM\n#> fastmap 1.1.1 2023-02-24 [1] RSPM\n#> forcats * 1.0.0 2023-01-29 [1] RSPM\n#> fs 1.6.4 2024-04-25 [1] RSPM\n#> P future 1.33.2 2024-03-26 [?] 
RSPM (R 4.3.0)\n#> P future.apply 1.11.2 2024-03-28 [?] RSPM (R 4.3.0)\n#> generics 0.1.3 2022-07-05 [1] RSPM\n#> ggplot2 * 3.5.1 2024-04-23 [1] RSPM\n#> P globals 0.16.3 2024-03-08 [?] RSPM\n#> glue 1.7.0 2024-01-09 [1] RSPM\n#> P gower 1.0.1 2022-12-22 [?] RSPM (R 4.3.0)\n#> gtable 0.3.5 2024-04-22 [1] RSPM\n#> P hardhat 1.3.1 2024-02-02 [?] RSPM\n#> hms 1.1.3 2023-03-21 [1] RSPM\n#> htmltools 0.5.8.1 2024-04-04 [1] RSPM\n#> htmlwidgets 1.6.4 2023-12-06 [1] RSPM\n#> httr 1.4.7 2023-08-15 [1] RSPM\n#> P ipred 0.9-14 2023-03-09 [?] RSPM (R 4.3.0)\n#> jsonlite 1.8.8 2023-12-04 [1] RSPM\n#> knitr 1.46 2024-04-06 [1] RSPM\n#> labeling 0.4.3 2023-08-29 [1] RSPM\n#> P lattice 0.22-6 2024-03-20 [?] RSPM\n#> P lava 1.8.0 2024-03-05 [?] RSPM\n#> lifecycle 1.0.4 2023-11-07 [1] RSPM\n#> P listenv 0.9.1 2024-01-29 [?] RSPM\n#> lubridate * 1.9.3 2023-09-27 [1] RSPM\n#> magrittr 2.0.3 2022-03-30 [1] RSPM\n#> P MASS 7.3-60 2023-05-04 [?] CRAN (R 4.3.1)\n#> P Matrix 1.6-5 2024-01-11 [?] CRAN (R 4.3.3)\n#> P MatrixModels 0.5-3 2023-11-06 [?] RSPM\n#> P MMWRweek 0.1.3 2020-04-22 [?] RSPM (R 4.3.0)\n#> munsell 0.5.1 2024-04-01 [1] RSPM\n#> P nnet 7.3-19 2023-05-03 [?] CRAN (R 4.3.1)\n#> openssl 2.1.2 2024-04-21 [1] RSPM\n#> P parallelly 1.37.1 2024-02-29 [?] RSPM\n#> P parsnip * 1.2.1 2024-03-22 [?] RSPM (R 4.3.0)\n#> pillar 1.9.0 2023-03-22 [1] RSPM\n#> pkgconfig 2.0.3 2019-09-22 [1] RSPM\n#> P prodlim 2023.08.28 2023-08-28 [?] RSPM (R 4.3.0)\n#> purrr * 1.0.2 2023-08-10 [1] RSPM\n#> P quantreg 5.97 2023-08-19 [?] RSPM (R 4.3.0)\n#> R.cache 0.16.0 2022-07-21 [1] RSPM\n#> R.methodsS3 1.8.2 2022-06-13 [1] RSPM\n#> R.oo 1.26.0 2024-01-24 [1] RSPM\n#> R.utils 2.12.3 2023-11-18 [1] RSPM\n#> R6 2.5.1 2021-08-19 [1] RSPM\n#> Rcpp 1.0.12 2024-01-09 [1] RSPM\n#> readr * 2.1.5 2024-01-10 [1] RSPM\n#> P recipes 1.0.10 2024-02-18 [?] RSPM\n#> P renv 1.0.7 2024-04-11 [?] RSPM\n#> rlang 1.1.3 2024-01-10 [1] RSPM\n#> rmarkdown 2.26 2024-03-05 [1] RSPM\n#> P rpart 4.1.23 2023-12-05 [?] 
CRAN (R 4.3.2)\n#> scales 1.3.0 2023-11-28 [1] RSPM\n#> sessioninfo 1.2.2 2021-12-06 [1] RSPM\n#> P slider 0.3.1 2023-10-12 [?] RSPM (R 4.3.0)\n#> P smoothqr 0.1.1 2023-09-08 [?] Github (dajmcdon/smoothqr@3def5f0)\n#> P SparseM 1.81 2021-02-18 [?] RSPM (R 4.3.0)\n#> stringi 1.8.3 2023-12-11 [1] RSPM\n#> stringr * 1.5.1 2023-11-14 [1] RSPM\n#> styler 1.10.3 2024-04-07 [1] RSPM\n#> P survival 3.6-4 2024-04-24 [?] RSPM\n#> tibble * 3.2.1 2023-03-20 [1] RSPM\n#> tidyr * 1.3.1 2024-01-24 [1] RSPM\n#> tidyselect 1.2.1 2024-03-11 [1] RSPM\n#> tidyverse * 2.0.0 2023-02-22 [1] RSPM\n#> timechange 0.3.0 2024-01-18 [1] RSPM\n#> P timeDate 4032.109 2023-12-14 [?] RSPM\n#> P tsibble 1.1.4 2024-01-29 [?] RSPM\n#> tzdb 0.4.0 2023-05-12 [1] RSPM\n#> P usethis 2.2.3 2024-02-19 [?] RSPM (R 4.3.0)\n#> utf8 1.2.4 2023-10-22 [1] RSPM\n#> vctrs 0.6.5 2023-12-01 [1] RSPM\n#> viridisLite 0.4.2 2023-05-02 [1] RSPM\n#> P warp 0.2.1 2023-11-02 [?] RSPM\n#> withr 3.0.0 2024-01-16 [1] RSPM\n#> P workflows 1.1.4 2024-02-19 [?] RSPM\n#> xfun 0.43 2024-03-25 [1] RSPM\n#> xml2 1.3.6 2023-12-04 [1] RSPM\n#> yaml 2.3.8 2023-12-11 [1] RSPM\n#> \n#> [1] /home/dshemeto/.cache/R/renv/library/delphi-tooling-book-a509243d/R-4.3/x86_64-pc-linux-gnu\n#> [2] /home/dshemeto/.cache/R/renv/sandbox/R-4.3/x86_64-pc-linux-gnu/9a444a72\n#> \n#> P ── Loaded and on-disk path mismatch.\n#> \n#> ───────────────────────────────────────────────────────────────────────────\n```\n:::\n:::\n\n\n
\n\n\n\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/sliding-forecasters/execute-results/html.json b/_freeze/sliding-forecasters/execute-results/html.json index 4c57753..474dbe2 100644 --- a/_freeze/sliding-forecasters/execute-results/html.json +++ b/_freeze/sliding-forecasters/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "fe90988fde3d6bdd6237da37e0a6583d", + "hash": "bcaf5b1ddd41612a320d992385364b2b", "result": { - "markdown": "# Pseudo-prospective forecast inspection\n\n\n::: {.cell}\n\n:::\n\n\n\nA key function from the epiprocess package is `epi_slide()`, which allows the\nuser to apply a function or formula-based computation over variables in an\n`epi_df` over a running window of `n` time steps (see the following `epiprocess`\nvignette to go over the basics of the function: [\"Slide a computation over\nsignal values\"](https://cmu-delphi.github.io/epiprocess/articles/slide.html)).\nThe equivalent sliding method for an `epi_archive` object can be called by using\nthe wrapper function `epix_slide()` (refer to the following vignette for the\nbasics of the function: [\"Work with archive objects and data\nrevisions\"](https://cmu-delphi.github.io/epiprocess/articles/archive.html)). The\nkey difference from `epi_slide()` is that it performs version-aware\ncomputations. That is, the function only uses data that would have been\navailable as of time t for that reference time.\n\nIn this vignette, we use `epi_slide()` and `epix_slide()` for backtesting our\n`arx_forecaster` on historical COVID-19 case data from the US and from Canada.\nMore precisely, we first demonstrate using `epi_slide()` to slide ARX\nforecasters over an `epi_df` object and compare the results obtained from using\ndifferent forecasting engines. 
We then compare these simple retrospective\nforecasts to more proper \"pseudoprospective\" forecasts generated using snapshots\nof the data that was available in real time, using `epix_slide()`.\n\n## Comparing different forecasting engines\n\n### Example using CLI and case data from US states \n\nFirst, we download the version history (i.e. archive) of the percentage of\ndoctor’s visits with CLI (COVID-like illness) computed from medical insurance\nclaims and the number of new confirmed COVID-19 cases per 100,000 population\n(daily) for all 50 states from the COVIDcast API. We process as before, with the\nmodification that we use `sync = \"locf\"` in `epix_merge()` so that the last\nversion of each observation can be carried forward to extrapolate unavailable\nversions for the less up-to-date input archive.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/grab-epi-data_d4d80a61c31b62ea1a5d61f1072177bf'}\n\n```{.r .cell-code}\nus_raw_history_dfs <-\n readRDS(system.file(\"extdata\", \"all_states_covidcast_signals.rds\",\n package = \"epipredict\", mustWork = TRUE\n ))\n\nus_cli_archive <- us_raw_history_dfs[[1]] %>%\n select(geo_value, time_value, version = issue, percent_cli = value) %>%\n as_epi_archive(compactify = TRUE)\nus_cases_archive <- us_raw_history_dfs[[2]] %>%\n select(geo_value, time_value, version = issue, case_rate = value) %>%\n as_epi_archive(compactify = TRUE)\n\nus_archive <- epix_merge(\n us_cli_archive, us_cases_archive,\n sync = \"locf\", compactify = TRUE\n)\n```\n:::\n\n\nAfter obtaining the latest snapshot of the data, we produce forecasts on that\ndata using the default engine of simple linear regression and compare to a\nrandom forest.\n\nNote that all of the warnings about the forecast date being less than the most\nrecent update date of the data have been suppressed to avoid cluttering the\noutput.\n\n\n::: {.cell layout-align=\"center\" 
hash='sliding-forecasters_cache/html/make-arx-kweek_501df4e2fa764c3cf5a36fe30f920bb9'}\n\n```{.r .cell-code}\n# Latest snapshot of data, and forecast dates\nus_latest <- epix_as_of(us_archive, max_version = max(us_archive$versions_end))\nfc_time_values <- seq(\n from = as.Date(\"2020-08-01\"),\n to = as.Date(\"2021-11-01\"),\n by = \"1 month\"\n)\naheads <- c(7, 14, 21, 28)\n\nk_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) {\n epi_slide(epi_df, ~ arx_forecaster(\n .x, outcome, predictors, engine,\n args_list = arx_args_list(ahead = ahead)\n )$predictions %>%\n select(-geo_value),\n before = 120L - 1L,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n select(geo_value, time_value, starts_with(\"fc\")) %>%\n mutate(engine_type = engine$engine)\n}\n\n# Generate the forecasts and bind them together\nfc <- bind_rows(\n map(aheads, ~ k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"), .x,\n engine = linear_reg()\n )) %>%\n list_rbind(),\n map(aheads, ~ k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"), .x,\n engine = rand_forest(mode = \"regression\")\n )) %>%\n list_rbind()\n) %>%\n pivot_quantiles_wider(contains(\"_distn\"))\n```\n:::\n\n\nHere, `arx_forecaster()` does all the heavy lifting. It creates leads of the\ntarget (respecting time stamps and locations) along with lags of the features\n(here, the response and doctors visits), estimates a forecasting model using the\nspecified engine, creates predictions, and non-parametric confidence bands. \n\nTo see how the predictions compare, we plot them on top of the latest case\nrates. Note that even though we've fitted the model on all states, \nwe'll just display the\nresults for two states, California (CA) and Florida (FL), to get a sense of the\nmodel performance while keeping the graphic simple. 
\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-arx_cf5ed426dadcf87aa72c873f89ba401b'}\n\n```{.r .cell-code code-fold=\"true\"}\nfc_cafl <- fc %>% filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_cafl <- us_latest %>% filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(fc_cafl, aes(fc_target_date, group = time_value, fill = engine_type)) +\n geom_line(\n data = latest_cafl, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_grid(engine_type ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_fill_brewer(palette = \"Set1\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-arx-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nFor the two states of interest, simple linear regression clearly performs better\nthan random forest in terms of accuracy of the predictions and does not\nresult in such in overconfident predictions (overly narrow confidence bands).\nThough, in general, neither approach produces amazingly accurate forecasts. \nThis could be because\nthe behaviour is rather different across states and the effects of other notable\nfactors such as age and public health measures may be important to account for\nin such forecasting. 
Including such factors as well as making enhancements such\nas correcting for outliers are some improvements one could make to this simple\nmodel.[^1]\n\n[^1]: Note that, despite the above caveats, simple models like this tend to out-perform many far more complicated models in the online Covid forecasting due to those models high variance predictions.\n\n### Example using case data from Canada\n\nBy leveraging the flexibility of `epiprocess`, we can apply the same techniques\nto data from other sources. Since some collaborators are in British Columbia,\nCanada, we'll do essentially the same thing for Canada as we did above.\n\nThe [COVID-19 Canada Open Data Working Group](https://opencovid.ca/) collects\ndaily time series data on COVID-19 cases, deaths, recoveries, testing and\nvaccinations at the health region and province levels. Data are collected from\npublicly available sources such as government datasets and news releases.\nUnfortunately, there is no simple versioned source, so we have created our own\nfrom the Github commit history.\n\nFirst, we load versioned case rates at the provincial level. After converting\nthese to 7-day averages (due to highly variable provincial reporting\nmismatches), we then convert the data to an `epi_archive` object, and extract\nthe latest version from it. 
Finally, we run the same forcasting exercise as for\nthe American data, but here we compare the forecasts produced from using simple\nlinear regression with those from using boosted regression trees.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/get-can-fc_b65b7b3bca55827fcf187571942492de'}\n\n```{.r .cell-code}\n# source(\"drafts/canada-case-rates.R)\ncan <- readRDS(system.file(\n \"extdata\", \"can_prov_cases.rds\",\n package = \"epipredict\", mustWork = TRUE\n))\ncan <- can %>%\n group_by(version, geo_value) %>%\n arrange(time_value) %>%\n mutate(cr_7dav = RcppRoll::roll_meanr(case_rate, n = 7L)) %>%\n as_epi_archive(compactify = TRUE)\n\ncan_latest <- epix_as_of(can, max_version = max(can$DT$version))\n\n# Generate the forecasts, and bind them together\ncan_fc <- bind_rows(\n map(aheads, ~ k_week_ahead(\n can_latest, \"cr_7dav\", \"cr_7dav\", .x, linear_reg()\n )) %>%\n list_rbind(),\n map(aheads, ~ k_week_ahead(\n can_latest, \"cr_7dav\", \"cr_7dav\", .x,\n boost_tree(mode = \"regression\", trees = 20)\n )) %>%\n list_rbind()\n) %>%\n pivot_quantiles_wider(contains(\"_distn\"))\n```\n:::\n\n\nThe first figure shows the results for all of the provinces using linear regression. 
\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-lr_749e70213871f43929436d4a578868fa'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"lm\"),\n aes(x = fc_target_date, group = time_value)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest, aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(\n title = \"Using simple linear regression\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-lr-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nCompare those forecasts with a related set using Gradient Boosting.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-boost_145622420fe9517007923111890c3146'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"xgboost\"),\n aes(x = fc_target_date, group = time_value)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest, aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n 
scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(\n title = \"Using boosted regression trees\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-boost-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nBoth approaches tend to produce quite volatile forecasts (point predictions)\nand/or are overly confident (very narrow bands), particularly when boosted\nregression trees are used. But as this is meant to be a simple demonstration of\nsliding with different engines in `arx_forecaster`, we may devote another\nvignette to work on improving the predictive modelling using the suite of tools\navailable in epipredict.\n\n## Pseudoprospective vs. unfaithful retrospective forecasting\n\n### Example using case data from US states \n\nWe will now run pseudoprospective forecasts based on properly-versioned data\n(that would have been available in real-time) to forecast future COVID-19 case\nrates from current and past COVID-19 case rates for all states. That is, we can\nmake forecasts on the archive, `us_archive`, and compare those to forecasts on\n(time windows of) the latest data, `us_latest`, using the same general set-up as\nabove. For pseudoprospective forecasting, note that `us_archive` is fed into\n`epix_slide()`, while for simpler (unfaithful) retrospective forecasting,\n`us_latest` is fed into `epi_slide()`. 
#%% update to include percent_cli after\nthat issue is fixed?\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-ar-kweek-asof_5e36fd086b20659f6706eda7f36c8d40'}\n\n```{.r .cell-code}\nk_week_versioning <- function(ahead, version = c(\"faithful\", \"unfaithful\")) {\n version <- match.arg(version)\n if (version == \"faithful\") {\n epix_slide(\n us_archive,\n ~ arx_forecaster(\n .x, \"case_rate\", c(\"case_rate\", \"percent_cli\"),\n args_list = arx_args_list(ahead = ahead)\n )$predictions,\n before = 120 - 1,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n mutate(version = \"version faithful\") %>%\n rename(geo_value = \"fc_geo_value\")\n } else {\n k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"),\n ahead, linear_reg()\n ) %>% mutate(version = \"not version faithful\")\n }\n}\n\n# Generate the forecasts, and bind them together\nfc <- bind_rows(\n map(aheads, ~ k_week_versioning(.x, \"faithful\")) %>% list_rbind(),\n map(aheads, ~ k_week_versioning(.x, \"unfaithful\")) %>% list_rbind()\n) %>% pivot_quantiles_wider(fc_.pred_distn)\n```\n:::\n\n\nNow we can plot the results on top of the latest case rates. 
As before, we will only display and focus on the results for FL and CA for simplicity.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-ar-asof_c6417eaf4d97855d750b9f8aeb315d67'}\n\n```{.r .cell-code code-fold=\"true\"}\nfc_cafl <- fc %>% filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_cafl <- us_latest %>% filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(fc_cafl, aes(x = fc_target_date, group = time_value)) +\n geom_line(\n data = latest_cafl, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = version), alpha = 0.4) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_grid(version ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n scale_fill_brewer(palette = \"Set1\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-ar-asof-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAgain, we observe that the results are not great for these two states, but\nthat's likely due to the simplicity of the model (ex. the omission of key\nfactors such as age and public health measures) and the quality of the data (ex.\nwe have not personally corrected for anomalies in the data).\n\nWe shall leave it to the reader to try the above version aware and unaware\nforecasting exercise on the Canadian case rate data. 
The above code for the\nAmerican state data should be readily adaptable for this purpose.\n", + "markdown": "# Pseudo-prospective forecast inspection\n\n\n::: {.cell}\n\n:::\n\n\n\nA key function from the epiprocess package is `epi_slide()`, which allows the\nuser to apply a function or formula-based computation over variables in an\n`epi_df` over a running window of `n` time steps (see the following `epiprocess`\nvignette to go over the basics of the function: [\"Slide a computation over\nsignal values\"](https://cmu-delphi.github.io/epiprocess/articles/slide.html)).\nThe equivalent sliding method for an `epi_archive` object can be called by using\nthe wrapper function `epix_slide()` (refer to the following vignette for the\nbasics of the function: [\"Work with archive objects and data\nrevisions\"](https://cmu-delphi.github.io/epiprocess/articles/archive.html)). The\nkey difference from `epi_slide()` is that it performs version-aware\ncomputations. That is, the function only uses data that would have been\navailable as of time t for that reference time.\n\nIn this vignette, we use `epi_slide()` and `epix_slide()` for backtesting our\n`arx_forecaster` on historical COVID-19 case data from the US and from Canada.\nMore precisely, we first demonstrate using `epi_slide()` to slide ARX\nforecasters over an `epi_df` object and compare the results obtained from using\ndifferent forecasting engines. We then compare these simple retrospective\nforecasts to more proper \"pseudoprospective\" forecasts generated using snapshots\nof the data that was available in real time, using `epix_slide()`.\n\n## Comparing different forecasting engines\n\n### Example using CLI and case data from US states\n\nFirst, we download the version history (i.e. archive) of the percentage of\ndoctor’s visits with CLI (COVID-like illness) computed from medical insurance\nclaims and the number of new confirmed COVID-19 cases per 100,000 population\n(daily) for all 50 states from the COVIDcast API. 
We process as before, with the\nmodification that we use `sync = \"locf\"` in `epix_merge()` so that the last\nversion of each observation can be carried forward to extrapolate unavailable\nversions for the less up-to-date input archive.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/grab-epi-data_89a9d4079f8ffc6080f83369668b2316'}\n\n```{.r .cell-code}\nus_raw_history_dfs <- readRDS(url(\n \"https://github.com/cmu-delphi/epipredict/raw/dev/vignettes/articles/all_states_covidcast_signals.rds\"\n))\n\nus_cli_archive <- us_raw_history_dfs[[1]] %>%\n select(geo_value, time_value, version = issue, percent_cli = value) %>%\n as_epi_archive(compactify = TRUE)\nus_cases_archive <- us_raw_history_dfs[[2]] %>%\n select(geo_value, time_value, version = issue, case_rate = value) %>%\n as_epi_archive(compactify = TRUE)\n\nus_archive <- epix_merge(\n us_cli_archive, us_cases_archive,\n sync = \"locf\", compactify = TRUE\n)\n```\n:::\n\n\nAfter obtaining the latest snapshot of the data, we produce forecasts on that\ndata using the default engine of simple linear regression and compare to a\nrandom forest.\n\nNote that all of the warnings about the forecast date being less than the most\nrecent update date of the data have been suppressed to avoid cluttering the\noutput.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-arx-kweek_404b2b076aee47ac4ed54f2b9ba369d2'}\n\n```{.r .cell-code}\n# Latest snapshot of data, and forecast dates\nus_latest <- epix_as_of(us_archive, max_version = max(us_archive$versions_end))\nfc_time_values <- seq(\n from = as.Date(\"2020-08-01\"),\n to = as.Date(\"2021-11-01\"),\n by = \"1 month\"\n)\naheads <- c(7, 14, 21, 28)\n\nk_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) {\n epi_slide(epi_df, ~ arx_forecaster(\n .x, outcome, predictors, engine,\n args_list = arx_args_list(ahead = ahead)\n )$predictions %>%\n select(-geo_value),\n before = 120L - 1L,\n ref_time_values 
= fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n select(geo_value, time_value, starts_with(\"fc\")) %>%\n mutate(engine_type = engine$engine)\n}\n\n# Generate the forecasts and bind them together\nfc <- bind_rows(\n map(aheads, ~ k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"), .x,\n engine = linear_reg()\n )) %>%\n list_rbind(),\n map(aheads, ~ k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"), .x,\n engine = rand_forest(mode = \"regression\")\n )) %>%\n list_rbind()\n) %>%\n pivot_quantiles_wider(contains(\"_distn\"))\n```\n:::\n\n\nHere, `arx_forecaster()` does all the heavy lifting. It creates leads of the\ntarget (respecting time stamps and locations) along with lags of the features\n(here, the response and doctors visits), estimates a forecasting model using the\nspecified engine, creates predictions, and non-parametric confidence bands.\n\nTo see how the predictions compare, we plot them on top of the latest case\nrates. Note that even though we've fitted the model on all states,\nwe'll just display the\nresults for two states, California (CA) and Florida (FL), to get a sense of the\nmodel performance while keeping the graphic simple.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-arx_c99e5bb41b438e998b3904771b96568d'}\n\n```{.r .cell-code code-fold=\"true\"}\nfc_cafl <- fc %>% filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_cafl <- us_latest %>% filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(fc_cafl, aes(fc_target_date, group = time_value, fill = engine_type)) +\n geom_line(\n data = latest_cafl, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_grid(engine_type ~ geo_value, scales = \"free\") +\n 
scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_fill_brewer(palette = \"Set1\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-arx-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nFor the two states of interest, simple linear regression clearly performs better\nthan random forest in terms of accuracy of the predictions and does not\nresult in such overconfident predictions (overly narrow confidence bands).\nThough, in general, neither approach produces amazingly accurate forecasts.\nThis could be because\nthe behaviour is rather different across states and the effects of other notable\nfactors such as age and public health measures may be important to account for\nin such forecasting. Including such factors as well as making enhancements such\nas correcting for outliers are some improvements one could make to this simple\nmodel.[^1]\n\n[^1]: Note that, despite the above caveats, simple models like this tend to out-perform many far more complicated models in the online Covid forecasting due to those models' high-variance predictions.\n\n### Example using case data from Canada\n\nBy leveraging the flexibility of `epiprocess`, we can apply the same techniques\nto data from other sources. Since some collaborators are in British Columbia,\nCanada, we'll do essentially the same thing for Canada as we did above.\n\nThe [COVID-19 Canada Open Data Working Group](https://opencovid.ca/) collects\ndaily time series data on COVID-19 cases, deaths, recoveries, testing and\nvaccinations at the health region and province levels. 
Data are collected from\npublicly available sources such as government datasets and news releases.\nUnfortunately, there is no simple versioned source, so we have created our own\nfrom the GitHub commit history.\n\nFirst, we load versioned case rates at the provincial level. After converting\nthese to 7-day averages (due to highly variable provincial reporting\nmismatches), we then convert the data to an `epi_archive` object, and extract\nthe latest version from it. Finally, we run the same forecasting exercise as for\nthe American data, but here we compare the forecasts produced from using simple\nlinear regression with those from using boosted regression trees.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/get-can-fc_731a38443595452707f8f146a49636dd'}\n\n```{.r .cell-code}\n# source(\"drafts/canada-case-rates.R)\ncan <- epidatasets::can_prov_cases\ncan <- can %>%\n group_by(version, geo_value) %>%\n arrange(time_value) %>%\n mutate(cr_7dav = RcppRoll::roll_meanr(case_rate, n = 7L)) %>%\n as_epi_archive(compactify = TRUE)\n\ncan_latest <- epix_as_of(can, max_version = max(can$DT$version))\n\n# Generate the forecasts, and bind them together\ncan_fc <- bind_rows(\n map(aheads, ~ k_week_ahead(\n can_latest, \"cr_7dav\", \"cr_7dav\", .x, linear_reg()\n )) %>%\n list_rbind(),\n map(aheads, ~ k_week_ahead(\n can_latest, \"cr_7dav\", \"cr_7dav\", .x,\n boost_tree(mode = \"regression\", trees = 20)\n )) %>%\n list_rbind()\n) %>%\n pivot_quantiles_wider(contains(\"_distn\"))\n```\n:::\n\n\nThe first figure shows the results for all of the provinces using linear regression.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-lr_a609b4a2e0dd0f49e145eb8a6b3ff50e'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"lm\"),\n aes(x = fc_target_date, group = time_value)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest, aes(x 
= time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(\n title = \"Using simple linear regression\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-lr-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nCompare those forecasts with a related set using Gradient Boosting.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-boost_c606f4fff3ff4eb8919817c6e7100441'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"xgboost\"),\n aes(x = fc_target_date, group = time_value)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest, aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(\n title = \"Using boosted regression trees\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: 
{.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-boost-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nBoth approaches tend to produce quite volatile forecasts (point predictions)\nand/or are overly confident (very narrow bands), particularly when boosted\nregression trees are used. But as this is meant to be a simple demonstration of\nsliding with different engines in `arx_forecaster`, we may devote another\nvignette to work on improving the predictive modelling using the suite of tools\navailable in epipredict.\n\n## Pseudoprospective vs. unfaithful retrospective forecasting\n\n### Example using case data from US states\n\nWe will now run pseudoprospective forecasts based on properly-versioned data\n(that would have been available in real-time) to forecast future COVID-19 case\nrates from current and past COVID-19 case rates for all states. That is, we can\nmake forecasts on the archive, `us_archive`, and compare those to forecasts on\n(time windows of) the latest data, `us_latest`, using the same general set-up as\nabove. For pseudoprospective forecasting, note that `us_archive` is fed into\n`epix_slide()`, while for simpler (unfaithful) retrospective forecasting,\n`us_latest` is fed into `epi_slide()`. 
#%% update to include percent_cli after\nthat issue is fixed?\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-ar-kweek-asof_bb2b98563c892027582a2aa804662dcc'}\n\n```{.r .cell-code}\nk_week_versioning <- function(ahead, version = c(\"faithful\", \"unfaithful\")) {\n version <- match.arg(version)\n if (version == \"faithful\") {\n epix_slide(\n us_archive,\n ~ arx_forecaster(\n .x, \"case_rate\", c(\"case_rate\", \"percent_cli\"),\n args_list = arx_args_list(ahead = ahead)\n )$predictions,\n before = 120 - 1,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n mutate(version = \"version faithful\") %>%\n rename(geo_value = \"fc_geo_value\")\n } else {\n k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"),\n ahead, linear_reg()\n ) %>% mutate(version = \"not version faithful\")\n }\n}\n\n# Generate the forecasts, and bind them together\nfc <- bind_rows(\n map(aheads, ~ k_week_versioning(.x, \"faithful\")) %>% list_rbind(),\n map(aheads, ~ k_week_versioning(.x, \"unfaithful\")) %>% list_rbind()\n) %>% pivot_quantiles_wider(fc_.pred_distn)\n```\n:::\n\n\nNow we can plot the results on top of the latest case rates. 
As before, we will only display and focus on the results for FL and CA for simplicity.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-ar-asof_e484487727d34e88d3d51c13eeb6cdaa'}\n\n```{.r .cell-code code-fold=\"true\"}\nfc_cafl <- fc %>% filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_cafl <- us_latest %>% filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(fc_cafl, aes(x = fc_target_date, group = time_value)) +\n geom_line(\n data = latest_cafl, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = version), alpha = 0.4) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_grid(version ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n scale_fill_brewer(palette = \"Set1\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-ar-asof-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAgain, we observe that the results are not great for these two states, but\nthat's likely due to the simplicity of the model (e.g., the omission of key\nfactors such as age and public health measures) and the quality of the data (e.g.,\nwe have not personally corrected for anomalies in the data).\n\nWe shall leave it to the reader to try the above version-aware and version-unaware\nforecasting exercise on the Canadian case rate data. 
The above code for the\nAmerican state data should be readily adaptable for this purpose.\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg b/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg index b6ff68b..7ffb5a0 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg
diff --git a/_freeze/sliding-forecasters/figure-html/plot-arx-1.svg b/_freeze/sliding-forecasters/figure-html/plot-arx-1.svg index 4145a41..6c1d4fc 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-arx-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-arx-1.svg @@ -1,2006 +1,2009 @@ - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - 
- + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - + + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svg b/_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svg index c6d0522..2c407fd 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svg @@ -1,5530 +1,5929 @@ - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - 
- + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - 
- + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - 
- + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + + - - + + - - + + - - + + - - + + - - + + - - + + + + - - + + + + - - + + + + + - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + + + + + + + - - + + + + - - + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + - - + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - + + - - + + - - + + - - + + - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - + + - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - + + - - + + - - - - - - - - - - - - - - - - + + + - - - - - - - + + - - + + - - + + - - + + - - + + - - + + - - + + + - - + + + + - - + + + - - + + - - + + - - + + - - + + + + + - - + + + + - - + + + + - - + + - - + + - - + + - - + + + - - + + - - + + - - + + - - + + + + + + + + - - + + - - + + + + + + - - + + - - + + - - + + - - + + + + + + + + - - + + + + + + + - - + + - - + + + + + + + + - - + + + + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + - - + + - - + + + + + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - + + - - - - + + - - - - - + + - - + + - - + + - - - - - - - - - - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + + + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svg b/_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svg index 92e22be..f082558 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svgdiff --git a/packages.bib b/packages.bib index b4c56f8..0b7321a 100644 --- a/packages.bib +++ b/packages.bib @@ -122,7 +122,8 @@ @Manual{R-tidyverse author = {Hadley Wickham}, year = {2023}, note = {R package version 2.0.0}, - url = {https://CRAN.R-project.org/package=tidyverse}, + url = {https://CRAN.R-project.org/package=tidyverse} +} @Misc{epidatr2015, title = {Delphi Epidata API}, diff --git a/sliding-forecasters.qmd b/sliding-forecasters.qmd index 327f9e4..8aee741 100644 --- a/sliding-forecasters.qmd +++ b/sliding-forecasters.qmd @@ -29,7 +29,7 @@ of the data that was available in real time, using `epix_slide()`. ## Comparing different forecasting engines -### Example using CLI and case data from US states +### Example using CLI and case data from US states First, we download the version history (i.e. 
archive) of the percentage of doctor’s visits with CLI (COVID-like illness) computed from medical insurance @@ -40,9 +40,9 @@ version of each observation can be carried forward to extrapolate unavailable versions for the less up-to-date input archive. ```{r grab-epi-data} -us_raw_history_dfs <- - readRDS(system.file("extdata", "all_states_covidcast_signals.rds", - package = "epipredict", mustWork = TRUE)) +us_raw_history_dfs <- readRDS(url( + "https://github.com/cmu-delphi/epipredict/raw/dev/vignettes/articles/all_states_covidcast_signals.rds" +)) us_cli_archive <- us_raw_history_dfs[[1]] %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% @@ -52,7 +52,8 @@ us_cases_archive <- us_raw_history_dfs[[2]] %>% as_epi_archive(compactify = TRUE) us_archive <- epix_merge( - us_cli_archive, us_cases_archive, sync = "locf", compactify = TRUE + us_cli_archive, us_cases_archive, + sync = "locf", compactify = TRUE ) ``` @@ -68,21 +69,22 @@ output. # Latest snapshot of data, and forecast dates us_latest <- epix_as_of(us_archive, max_version = max(us_archive$versions_end)) fc_time_values <- seq( - from = as.Date("2020-08-01"), - to = as.Date("2021-11-01"), + from = as.Date("2020-08-01"), + to = as.Date("2021-11-01"), by = "1 month" ) aheads <- c(7, 14, 21, 28) k_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) { epi_slide(epi_df, ~ arx_forecaster( - .x, outcome, predictors, engine, - args_list = arx_args_list(ahead = ahead))$predictions %>% - select(-geo_value), - before = 120L - 1L, - ref_time_values = fc_time_values, - new_col_name = "fc" - ) %>% + .x, outcome, predictors, engine, + args_list = arx_args_list(ahead = ahead) + )$predictions %>% + select(-geo_value), + before = 120L - 1L, + ref_time_values = fc_time_values, + new_col_name = "fc" + ) %>% select(geo_value, time_value, starts_with("fc")) %>% mutate(engine_type = engine$engine) } @@ -92,38 +94,40 @@ fc <- bind_rows( map(aheads, ~ k_week_ahead( us_latest, "case_rate", 
c("case_rate", "percent_cli"), .x, engine = linear_reg() - )) %>% + )) %>% list_rbind(), map(aheads, ~ k_week_ahead( us_latest, "case_rate", c("case_rate", "percent_cli"), .x, engine = rand_forest(mode = "regression") )) %>% list_rbind() -) %>% +) %>% pivot_quantiles_wider(contains("_distn")) ``` Here, `arx_forecaster()` does all the heavy lifting. It creates leads of the target (respecting time stamps and locations) along with lags of the features (here, the response and doctors visits), estimates a forecasting model using the -specified engine, creates predictions, and non-parametric confidence bands. +specified engine, creates predictions, and non-parametric confidence bands. To see how the predictions compare, we plot them on top of the latest case -rates. Note that even though we've fitted the model on all states, +rates. Note that even though we've fitted the model on all states, we'll just display the results for two states, California (CA) and Florida (FL), to get a sense of the -model performance while keeping the graphic simple. +model performance while keeping the graphic simple. 
```{r plot-arx, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} #| code-fold: true fc_cafl <- fc %>% filter(geo_value %in% c("ca", "fl")) latest_cafl <- us_latest %>% filter(geo_value %in% c("ca", "fl")) -ggplot(fc_cafl, aes(fc_target_date, group = time_value, fill = engine_type)) + - geom_line(data = latest_cafl, aes(x = time_value, y = case_rate), - inherit.aes = FALSE, color = "gray50") + +ggplot(fc_cafl, aes(fc_target_date, group = time_value, fill = engine_type)) + + geom_line( + data = latest_cafl, aes(x = time_value, y = case_rate), + inherit.aes = FALSE, color = "gray50" + ) + geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) + - geom_line(aes(y = fc_.pred)) + + geom_line(aes(y = fc_.pred)) + geom_point(aes(y = fc_.pred), size = 0.5) + geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + facet_grid(engine_type ~ geo_value, scales = "free") + @@ -137,7 +141,7 @@ ggplot(fc_cafl, aes(fc_target_date, group = time_value, fill = engine_type)) + For the two states of interest, simple linear regression clearly performs better than random forest in terms of accuracy of the predictions and does not result in such in overconfident predictions (overly narrow confidence bands). -Though, in general, neither approach produces amazingly accurate forecasts. +Though, in general, neither approach produces amazingly accurate forecasts. This could be because the behaviour is rather different across states and the effects of other notable factors such as age and public health measures may be important to account for @@ -169,13 +173,10 @@ linear regression with those from using boosted regression trees. 
```{r get-can-fc, warning = FALSE} # source("drafts/canada-case-rates.R) -can <- readRDS(system.file( - "extdata", "can_prov_cases.rds", - package = "epipredict", mustWork = TRUE -)) +can <- epidatasets::can_prov_cases can <- can %>% - group_by(version, geo_value) %>% - arrange(time_value) %>% + group_by(version, geo_value) %>% + arrange(time_value) %>% mutate(cr_7dav = RcppRoll::roll_meanr(case_rate, n = 7L)) %>% as_epi_archive(compactify = TRUE) @@ -196,46 +197,62 @@ can_fc <- bind_rows( pivot_quantiles_wider(contains("_distn")) ``` -The first figure shows the results for all of the provinces using linear regression. +The first figure shows the results for all of the provinces using linear regression. ```{r plot-can-fc-lr, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 12} #| code-fold: true -ggplot(can_fc %>% filter(engine_type == "lm"), - aes(x = fc_target_date, group = time_value)) + +ggplot( + can_fc %>% filter(engine_type == "lm"), + aes(x = fc_target_date, group = time_value) +) + coord_cartesian(xlim = lubridate::ymd(c("2020-12-01", NA))) + - geom_line(data = can_latest, aes(x = time_value, y = cr_7dav), - inherit.aes = FALSE, color = "gray50") + + geom_line( + data = can_latest, aes(x = time_value, y = cr_7dav), + inherit.aes = FALSE, color = "gray50" + ) + geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value), - alpha = 0.4) + - geom_line(aes(y = fc_.pred)) + geom_point(aes(y = fc_.pred), size = 0.5) + + alpha = 0.4 + ) + + geom_line(aes(y = fc_.pred)) + + geom_point(aes(y = fc_.pred), size = 0.5) + geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + - facet_wrap(~ geo_value, scales = "free_y", ncol = 3) + + facet_wrap(~geo_value, scales = "free_y", ncol = 3) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + scale_y_continuous(expand = expansion(c(0, 0.05))) + - labs(title = "Using simple linear regression", x = "Date", - y = "Reported COVID-19 case rates") + - theme(legend.position = "none") + 
labs( + title = "Using simple linear regression", x = "Date", + y = "Reported COVID-19 case rates" + ) + + theme(legend.position = "none") ``` Compare those forecasts with a related set using Gradient Boosting. ```{r plot-can-fc-boost, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 12} #| code-fold: true -ggplot(can_fc %>% filter(engine_type == "xgboost"), - aes(x = fc_target_date, group = time_value)) + +ggplot( + can_fc %>% filter(engine_type == "xgboost"), + aes(x = fc_target_date, group = time_value) +) + coord_cartesian(xlim = lubridate::ymd(c("2020-12-01", NA))) + - geom_line(data = can_latest, aes(x = time_value, y = cr_7dav), - inherit.aes = FALSE, color = "gray50") + + geom_line( + data = can_latest, aes(x = time_value, y = cr_7dav), + inherit.aes = FALSE, color = "gray50" + ) + geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value), - alpha = 0.4) + - geom_line(aes(y = fc_.pred)) + geom_point(aes(y = fc_.pred), size = 0.5) + + alpha = 0.4 + ) + + geom_line(aes(y = fc_.pred)) + + geom_point(aes(y = fc_.pred), size = 0.5) + geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + - facet_wrap(~ geo_value, scales = "free_y", ncol = 3) + + facet_wrap(~geo_value, scales = "free_y", ncol = 3) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + scale_y_continuous(expand = expansion(c(0, 0.05))) + - labs(title = "Using boosted regression trees", x = "Date", - y = "Reported COVID-19 case rates") + - theme(legend.position = "none") + labs( + title = "Using boosted regression trees", x = "Date", + y = "Reported COVID-19 case rates" + ) + + theme(legend.position = "none") ``` Both approaches tend to produce quite volatile forecasts (point predictions) @@ -247,7 +264,7 @@ available in epipredict. ## Pseudoprospective vs. 
unfaithful retrospective forecasting -### Example using case data from US states +### Example using case data from US states We will now run pseudoprospective forecasts based on properly-versioned data (that would have been available in real-time) to forecast future COVID-19 case @@ -261,22 +278,25 @@ that issue is fixed? ```{r make-ar-kweek-asof} k_week_versioning <- function(ahead, version = c("faithful", "unfaithful")) { - version = match.arg(version) + version <- match.arg(version) if (version == "faithful") { epix_slide( us_archive, ~ arx_forecaster( .x, "case_rate", c("case_rate", "percent_cli"), - args_list = arx_args_list(ahead = ahead))$predictions, - before = 120 - 1, - ref_time_values = fc_time_values, - new_col_name = "fc") %>% + args_list = arx_args_list(ahead = ahead) + )$predictions, + before = 120 - 1, + ref_time_values = fc_time_values, + new_col_name = "fc" + ) %>% mutate(version = "version faithful") %>% rename(geo_value = "fc_geo_value") } else { k_week_ahead( - us_latest, "case_rate", c("case_rate", "percent_cli"), - ahead, linear_reg()) %>% mutate(version = "not version faithful") + us_latest, "case_rate", c("case_rate", "percent_cli"), + ahead, linear_reg() + ) %>% mutate(version = "not version faithful") } } @@ -295,10 +315,13 @@ fc_cafl <- fc %>% filter(geo_value %in% c("ca", "fl")) latest_cafl <- us_latest %>% filter(geo_value %in% c("ca", "fl")) ggplot(fc_cafl, aes(x = fc_target_date, group = time_value)) + - geom_line(data = latest_cafl, aes(x = time_value, y = case_rate), - inherit.aes = FALSE, color = "gray50") + + geom_line( + data = latest_cafl, aes(x = time_value, y = case_rate), + inherit.aes = FALSE, color = "gray50" + ) + geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = version), alpha = 0.4) + - geom_line(aes(y = fc_.pred)) + geom_point(aes(y = fc_.pred), size = 0.5) + + geom_line(aes(y = fc_.pred)) + + geom_point(aes(y = fc_.pred), size = 0.5) + geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + 
facet_grid(version ~ geo_value, scales = "free") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + From f9b3acebbe1611f4ca54b7c45d2f0907af9c0bbf Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 6 May 2024 14:32:28 -0700 Subject: [PATCH 5/8] Update archive.qmd Co-authored-by: Daniel McDonald --- archive.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archive.qmd b/archive.qmd index 0650f1b..36e24f0 100644 --- a/archive.qmd +++ b/archive.qmd @@ -55,7 +55,7 @@ print(x) ``` An `epi_archive` is an S3 class. Its primary field is a data table `DT`, which -is of class `data.table` (from the `data.table` package), and has columns +is of class `data.table` (from the `{data.table}` package), and has columns `geo_value`, `time_value`, `version`, as well as any number of additional columns. From eeebbc83a01fb4216c64c63ef6742197844b26e2 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 6 May 2024 14:32:39 -0700 Subject: [PATCH 6/8] Update epiprocess.qmd Co-authored-by: Daniel McDonald --- epiprocess.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epiprocess.qmd b/epiprocess.qmd index e2e5776..36bc312 100644 --- a/epiprocess.qmd +++ b/epiprocess.qmd @@ -39,7 +39,7 @@ The second main data structure in the package is called [`epi_archive`]. This is an S3 class containing a data table that stores the archive (version history) of some signal variables of interest. -By convention, functions in the `epiprocess` package that operate on +By convention, functions in the `{epiprocess}` package that operate on `epi_archive` objects begin with `epix` (the "x" is meant to remind you of "archive"). 
For example: From 2a82c3d5e9fa4d3df5334b0f1899c80cc6b4b231 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 26 Sep 2024 13:52:59 -0700 Subject: [PATCH 7/8] fix: update the book for epiprocess 0.9.0 --- _freeze/archive/execute-results/html.json | 4 +- .../archive/figure-html/unnamed-chunk-8-1.svg | 2 +- _freeze/epidf/execute-results/html.json | 4 +- .../epidf/figure-html/unnamed-chunk-11-1.svg | 941 +- .../epidf/figure-html/unnamed-chunk-13-1.svg | 1423 +- .../epidf/figure-html/unnamed-chunk-15-1.svg | 3989 +-- _freeze/outliers/execute-results/html.json | 4 +- .../figure-html/unnamed-chunk-3-1.svg | 1057 +- .../figure-html/unnamed-chunk-7-1.svg | 2282 +- .../figure-html/unnamed-chunk-7-2.svg | 2246 +- .../figure-html/unnamed-chunk-9-1.svg | 1069 +- _freeze/slide/execute-results/html.json | 4 +- .../slide/figure-html/unnamed-chunk-10-1.svg | 308 + .../slide/figure-html/unnamed-chunk-12-1.svg | 19061 ++++++++++-- .../slide/figure-html/unnamed-chunk-16-1.svg | 2860 ++ .../slide/figure-html/unnamed-chunk-8-1.svg | 24153 ++++++++-------- .../execute-results/html.json | 4 +- .../figure-html/plot-ar-asof-1.svg | 1812 +- .../figure-html/plot-arx-1.svg | 1210 +- .../figure-html/plot-can-fc-boost-1.svg | 5656 ++-- .../figure-html/plot-can-fc-lr-1.svg | 3826 ++- archive.qmd | 38 +- epidf.qmd | 107 +- outliers.qmd | 65 +- renv.lock | 612 +- renv/activate.R | 105 +- slide.qmd | 346 +- sliding-forecasters.qmd | 283 +- 28 files changed, 45664 insertions(+), 27807 deletions(-) create mode 100644 _freeze/slide/figure-html/unnamed-chunk-10-1.svg create mode 100644 _freeze/slide/figure-html/unnamed-chunk-16-1.svg diff --git a/_freeze/archive/execute-results/html.json b/_freeze/archive/execute-results/html.json index 8c57390..c74dd54 100644 --- a/_freeze/archive/execute-results/html.json +++ b/_freeze/archive/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "4abbf9d8187ca890d0c13fd0656d50e9", + "hash": "7f45dbf54b783ddcda2ce2462e5f9d69", "result": { - "markdown": "# Work 
with archive objects and data revisions\n\nIn addition to the `epi_df` data structure, which we have been working with all\nalong in these vignettes, the `epiprocess` package has a companion structure\ncalled `epi_archive`. In comparison to an `epi_df` object, which can be seen as\nstoring a single snapshot of a data set with the most up-to-date signal values\nas of some given time, an `epi_archive` object stores the full version history\nof a data set. Many signals of interest for epidemiological tracking are subject\nto revision (some more than others), and paying attention to data revisions can\nbe important for all sorts of downstream data analysis and modeling tasks.\n\nThis chapter walks through working with `epi_archive` objects and demonstrates\nsome of their key functionality. We'll work with a signal on the percentage of\ndoctor's visits with CLI (COVID-like illness) computed from medical insurance\nclaims, available through the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). This\nsignal is subject to very heavy and regular revision; you can read more about it\non its [API documentation\npage](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). 
We'll use the offline version stored in `{epidatasets}`.\n\n\n\n\n\n\n## Getting data into `epi_archive` format\n\nAn `epi_archive` object can be constructed from a data frame, data table, or\ntibble, provided that it has (at least) the following columns:\n\n* `geo_value`: the geographic value associated with each row of measurements.\n* `time_value`: the time value associated with each row of measurements.\n* `version`: the time value specifying the version for each row of measurements.\n For example, if in a given row the `version` is January 15, 2022 and\n `time_value` is January 14, 2022, then this row contains the measurements of\n the data for January 14, 2022 that were available one day later.\n\nAs we can see from the above, the data frame returned by\n`epidatr::covidcast()` has the columns required for the `epi_archive`\nformat, so we use\n`as_epi_archive()` to cast it into `epi_archive` format.[^1]\n\n[^1]: For a discussion of the removal of\nredundant version updates in `as_epi_archive` using compactify, please refer\nto the [compactify vignette](https://cmu-delphi.github.io/epiprocess/articles/compactify.html).\n\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-2_39c5cbdbb56253b327ea66e6ab4e8220'}\n\n```{.r .cell-code}\nx <- archive_cases_dv_subset_dt %>%\n select(geo_value, time_value, version, percent_cli) %>%\n as_epi_archive(compactify = TRUE)\n\nclass(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_archive\"\n```\n:::\n\n```{.r .cell-code}\nprint(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> → An `epi_archive` object, with metadata:\n#> ℹ Min/max time values: 2020-06-01 / 2021-11-30\n#> ℹ First/last version with update: 2020-06-02 / 2021-12-01\n#> ℹ Versions end: 2021-12-01\n#> ℹ A preview of the table (119316 rows x 4 columns):\n#> Key: \n#> geo_value time_value version percent_cli\n#> \n#> 1: ca 2020-06-01 2020-06-02 NA\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 
2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> --- \n#> 119312: tx 2021-11-26 2021-11-29 1.858596\n#> 119313: tx 2021-11-27 2021-11-28 NA\n#> 119314: tx 2021-11-28 2021-11-29 NA\n#> 119315: tx 2021-11-29 2021-11-30 NA\n#> 119316: tx 2021-11-30 2021-12-01 NA\n```\n:::\n:::\n\n\nAn `epi_archive` is an S3 class. Its primary field is a data table `DT`, which\nis of class `data.table` (from the `data.table` package), and has columns\n`geo_value`, `time_value`, `version`, as well as any number of additional\ncolumns.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-3_99d23f4e3321a367498344c4b6282562'}\n\n```{.r .cell-code}\nclass(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"data.table\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nhead(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Key: \n#> geo_value time_value version percent_cli\n#> \n#> 1: ca 2020-06-01 2020-06-02 NA\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> 6: ca 2020-06-01 2020-06-11 2.197207\n```\n:::\n:::\n\n\nThe variables `geo_value`, `time_value`, `version` serve as **key variables**\nfor the data table, as well as any other specified in the metadata (described\nbelow). 
There can only be a single row per unique combination of key variables,\nand therefore the key variables are critical for figuring out how to generate a\nsnapshot of data from the archive, as of a given version (also described below).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-4_8b3712fe1140194d1eb702521cf15238'}\n\n```{.r .cell-code}\nkey(x$DT)\n```\n\n::: {.cell-output .cell-output-error}\n```\n#> Error in key(x$DT): could not find function \"key\"\n```\n:::\n:::\n\n\nIn general, the last version of each observation is carried forward (LOCF) to\nfill in data between recorded versions.\n\n## Some details on metadata\n\nThe following pieces of metadata are included as fields in an `epi_archive`\nobject:\n\n* `geo_type`: the type for the geo values.\n* `time_type`: the type for the time values.\n* `additional_metadata`: list of additional metadata for the data archive.\n\nMetadata for an `epi_archive` object `x` can be accessed (and altered) directly,\nas in `x$geo_type` or `x$time_type`, etc. Just like `as_epi_df()`, the function\n`as_epi_archive()` attempts to guess metadata fields when an `epi_archive`\nobject is instantiated, if they are not explicitly specified in the function\ncall (as it did in the case above).\n\n## Producing snapshots in `epi_df` form\n\nA key method of an `epi_archive` class is `as_of()`, which generates a snapshot\nof the archive in `epi_df` format. This represents the most up-to-date values of\nthe signal variables as of a given version. 
This can be accessed via\n`epix_as_of()`.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-5_d82372bbd8143517377c9afe9103cce8'}\n\n```{.r .cell-code}\nx_snapshot <- epix_as_of(x, max_version = as.Date(\"2021-06-01\"))\nclass(x_snapshot)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_df\" \"tbl_df\" \"tbl\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nx_snapshot\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 1,460 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2021-06-01\n#> \n#> # A tibble: 1,460 × 3\n#> geo_value time_value percent_cli\n#> * \n#> 1 ca 2020-06-01 2.75\n#> 2 ca 2020-06-02 2.57\n#> 3 ca 2020-06-03 2.48\n#> 4 ca 2020-06-04 2.41\n#> 5 ca 2020-06-05 2.57\n#> 6 ca 2020-06-06 2.63\n#> # ℹ 1,454 more rows\n```\n:::\n\n```{.r .cell-code}\nmax(x_snapshot$time_value)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"2021-05-31\"\n```\n:::\n\n```{.r .cell-code}\nattributes(x_snapshot)$metadata$as_of\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"2021-06-01\"\n```\n:::\n:::\n\n\nWe can see that the max time value in the `epi_df` object `x_snapshot` that was\ngenerated from the archive is May 29, 2021, even though the specified version\ndate was June 1, 2021. From this we can infer that the doctor's visits signal\nwas 2 days latent on June 1. Also, we can see that the metadata in the `epi_df`\nobject has the version date recorded in the `as_of` field.\n\nBy default, using the maximum of the `version` column in the underlying data table in an\n`epi_archive` object itself generates a snapshot of the latest values of signal\nvariables in the entire archive. 
The `epix_as_of()` function issues a warning in\nthis case, since updates to the current version may still come in at a later\npoint in time, due to various reasons, such as synchronization issues.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-6_ae9ceb907b24026cb708ea184ff52cc4'}\n\n```{.r .cell-code}\nx_latest <- epix_as_of(x, max_version = max(x$DT$version))\n```\n:::\n\n\nBelow, we pull several snapshots from the archive, spaced one month apart. We\noverlay the corresponding signal curves as colored lines, with the version dates\nmarked by dotted vertical lines, and draw the latest curve in black (from the\nlatest snapshot `x_latest` that the archive can provide).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-7_7b41b3bd4e404515018a4c8d6293057d'}\n\n```{.r .cell-code}\nself_max <- max(x$DT$version)\nversions <- seq(as.Date(\"2020-06-01\"), self_max - 1, by = \"1 month\")\nsnapshots <- map(\n versions,\n function(v) {\n epix_as_of(x, max_version = v) %>% mutate(version = v)\n }\n) %>%\n list_rbind() %>%\n bind_rows(x_latest %>% mutate(version = self_max)) %>%\n mutate(latest = version == self_max)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-8_8625834090bf668df1c1c2bcae527e81'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n snapshots %>% filter(!latest),\n aes(x = time_value, y = percent_cli)\n) +\n geom_line(aes(color = factor(version)), na.rm = TRUE) +\n geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n scale_color_viridis_d(option = \"A\", end = .9) +\n labs(x = \"Date\", y = \"% of doctor's visits with CLI\") +\n theme(legend.position = \"none\") +\n geom_line(\n data = snapshots %>% filter(latest),\n aes(x = time_value, y = percent_cli),\n inherit.aes = FALSE, color = \"black\", na.rm = TRUE\n 
)\n```\n\n::: {.cell-output-display}\n![](archive_files/figure-html/unnamed-chunk-8-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nWe can see some interesting and highly nontrivial revision behavior: at some\npoints in time the provisional data snapshots grossly underestimate the latest\ncurve (look in particular at Florida close to the end of 2021), and at others\nthey overestimate it (both states towards the beginning of 2021), though not\nquite as dramatically. Modeling the revision process, which is often called\n*backfill modeling*, is an important statistical problem in it of itself.\n\n\n## Merging `epi_archive` objects\n\nNow we demonstrate how to merge two `epi_archive` objects together, e.g., so\nthat grabbing data from multiple sources as of a particular version can be\nperformed with a single `as_of` call. The `epiprocess` packages provides\n`epix_merge()` for this purpose. Below we merge the working `epi_archive` of\nversioned percentage CLI from outpatient visits to another one of versioned\nCOVID-19 case reporting data, which we fetch the from the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html/), on the\nrate scale (counts per 100,000 people in the population).\n\nWhen merging archives, unless the archives have identical data release patterns,\n`NA`s can be introduced in the non-key variables for a few reasons:\n- to represent the \"value\" of an observation before its initial release (when we\n need to pair it with additional observations from the other archive that have\n been released)\n- to represent the \"value\" of an observation that has no recorded versions at\n all (in the same sort of situation)\n- if requested via `sync = \"na\"`, to represent potential update data that we do\n not yet have access to (e.g., due to encountering issues while attempting to\n download the currently available version data for one of the archives, but not\n the other).\n\n\n::: {.cell layout-align=\"center\" 
hash='archive_cache/html/unnamed-chunk-9_dd8b9566b93ee92760a9756afba10db6'}\n\n```{.r .cell-code}\n# This code is for illustration and doesn't run.\n# The result is saved/loaded in the (hidden) next chunk from `{epidatasets}`\ny <- pub_covidcast(\n source = \"jhu-csse\",\n signals = \"confirmed_7dav_incidence_prop\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20200601, 20211201),\n geo_values = \"ca,fl,ny,tx\",\n issues = epirange(20200601, 20211201)\n) %>%\n select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%\n as_epi_archive(compactify = TRUE)\n\nx <- epix_merge(x, y, sync = \"locf\", compactify = FALSE)\nprint(x)\nhead(x$DT)\n```\n:::\n\n\n## Sliding version-aware computations\n\n::: {.callout-note}\nTODO: need a simple example here.\n:::\n", + "markdown": "# Work with archive objects and data revisions\n\nIn addition to the `epi_df` data structure, which we have been working with all\nalong in these vignettes, the `epiprocess` package has a companion structure\ncalled `epi_archive`. In comparison to an `epi_df` object, which can be seen as\nstoring a single snapshot of a data set with the most up-to-date signal values\nas of some given time, an `epi_archive` object stores the full version history\nof a data set. Many signals of interest for epidemiological tracking are subject\nto revision (some more than others), and paying attention to data revisions can\nbe important for all sorts of downstream data analysis and modeling tasks.\n\nThis chapter walks through working with `epi_archive` objects and demonstrates\nsome of their key functionality. We'll work with a signal on the percentage of\ndoctor's visits with CLI (COVID-like illness) computed from medical insurance\nclaims, available through the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). 
This\nsignal is subject to very heavy and regular revision; you can read more about it\non its [API documentation\npage](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html).\nWe'll use the offline version stored in `{epidatasets}`.\n\n\n\n\n\n## Getting data into `epi_archive` format\n\nAn `epi_archive` object can be constructed from a data frame, data table, or\ntibble, provided that it has (at least) the following columns:\n\n* `geo_value`: the geographic value associated with each row of measurements.\n* `time_value`: the time value associated with each row of measurements.\n* `version`: the time value specifying the version for each row of measurements.\n For example, if in a given row the `version` is January 15, 2022 and\n `time_value` is January 14, 2022, then this row contains the measurements of\n the data for January 14, 2022 that were available one day later.\n\nAs we can see from the above, the data frame returned by\n`epidatr::pub_covidcast()` has the columns required for the `epi_archive`\nformat, so we use\n`as_epi_archive()` to cast it into `epi_archive` format.[^1]\n\n[^1]: For a discussion of the removal of\nredundant version updates in `as_epi_archive` using compactify, please refer\nto the [compactify vignette](https://cmu-delphi.github.io/epiprocess/articles/compactify.html).\n\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-2_b11f7e2e1cfae56bec7a39b13bb5558c'}\n\n```{.r .cell-code}\nx <- archive_cases_dv_subset_dt %>%\n select(geo_value, time_value, version, percent_cli) %>%\n as_epi_archive(compactify = TRUE)\n\nclass(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_archive\"\n```\n:::\n\n```{.r .cell-code}\nprint(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> → An `epi_archive` object, with metadata:\n#> ℹ Min/max time values: 2020-06-01 / 2021-11-30\n#> ℹ First/last version with update: 2020-06-02 / 2021-12-01\n#> ℹ Versions end: 2021-12-01\n#> 
ℹ A preview of the table (119316 rows x 4 columns):\n#> Key: \n#> geo_value time_value version percent_cli\n#> \n#> 1: ca 2020-06-01 2020-06-02 NA\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> --- \n#> 119312: tx 2021-11-26 2021-11-29 1.858596\n#> 119313: tx 2021-11-27 2021-11-28 NA\n#> 119314: tx 2021-11-28 2021-11-29 NA\n#> 119315: tx 2021-11-29 2021-11-30 NA\n#> 119316: tx 2021-11-30 2021-12-01 NA\n```\n:::\n:::\n\n\nAn `epi_archive` is an S3 class. Its primary field is a data table `DT`, which\nis of class `data.table` (from the `{data.table}` package), and has columns\n`geo_value`, `time_value`, `version`, as well as any number of additional\ncolumns.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-3_99d23f4e3321a367498344c4b6282562'}\n\n```{.r .cell-code}\nclass(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"data.table\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nhead(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> Key: \n#> geo_value time_value version percent_cli\n#> \n#> 1: ca 2020-06-01 2020-06-02 NA\n#> 2: ca 2020-06-01 2020-06-06 2.140116\n#> 3: ca 2020-06-01 2020-06-08 2.140379\n#> 4: ca 2020-06-01 2020-06-09 2.114430\n#> 5: ca 2020-06-01 2020-06-10 2.133677\n#> 6: ca 2020-06-01 2020-06-11 2.197207\n```\n:::\n:::\n\n\nThe variables `geo_value`, `time_value`, `version` serve as **key variables**\nfor the data table, as well as any other specified in the metadata (described\nbelow). 
There can only be a single row per unique combination of key variables,\nand therefore the key variables are critical for figuring out how to generate a\nsnapshot of data from the archive, as of a given version (also described below).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-4_77a1a144e9c650c7e533687354a3a0de'}\n\n```{.r .cell-code}\ndata.table::key(x$DT)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"geo_value\" \"time_value\" \"version\"\n```\n:::\n:::\n\n\nIn general, the last version of each observation is carried forward (LOCF) to\nfill in data between recorded versions.\n\n## Some details on metadata\n\nThe following pieces of metadata are included as fields in an `epi_archive`\nobject:\n\n* `geo_type`: the type for the geo values.\n* `time_type`: the type for the time values.\n* `additional_metadata`: list of additional metadata for the data archive.\n\nMetadata for an `epi_archive` object `x` can be accessed (and altered) directly,\nas in `x$geo_type` or `x$time_type`, etc. Just like `as_epi_df()`, the function\n`as_epi_archive()` attempts to guess metadata fields when an `epi_archive`\nobject is instantiated, if they are not explicitly specified in the function\ncall (as it did in the case above).\n\n## Producing snapshots in `epi_df` form\n\nA key method of an `epi_archive` class is `as_of()`, which generates a snapshot\nof the archive in `epi_df` format. This represents the most up-to-date values of\nthe signal variables as of a given version. 
This can be accessed via\n`epix_as_of()`.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-5_1836424cccdf3d1cf0472182f7c859f6'}\n\n```{.r .cell-code}\nx_snapshot <- epix_as_of(x, version = as.Date(\"2021-06-01\"))\nclass(x_snapshot)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_df\" \"tbl_df\" \"tbl\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nx_snapshot\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 1,460 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2021-06-01\n#> \n#> # A tibble: 1,460 × 3\n#> geo_value time_value percent_cli\n#> * \n#> 1 ca 2020-06-01 2.75\n#> 2 ca 2020-06-02 2.57\n#> 3 ca 2020-06-03 2.48\n#> 4 ca 2020-06-04 2.41\n#> 5 ca 2020-06-05 2.57\n#> 6 ca 2020-06-06 2.63\n#> # ℹ 1,454 more rows\n```\n:::\n\n```{.r .cell-code}\nmax(x_snapshot$time_value)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"2021-05-31\"\n```\n:::\n\n```{.r .cell-code}\nattributes(x_snapshot)$metadata$as_of\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"2021-06-01\"\n```\n:::\n:::\n\n\nWe can see that the max time value in the `epi_df` object `x_snapshot` that was\ngenerated from the archive is May 31, 2021, even though the specified version\ndate was June 1, 2021. From this we can infer that the doctor's visits signal\nwas 1 day latent on June 1. Also, we can see that the metadata in the `epi_df`\nobject has the version date recorded in the `as_of` field.\n\nBy default, using the maximum of the `version` column in the underlying data table in an\n`epi_archive` object itself generates a snapshot of the latest values of signal\nvariables in the entire archive. 
The `epix_as_of()` function issues a warning in\nthis case, since updates to the current version may still come in at a later\npoint in time, due to various reasons, such as synchronization issues.\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-6_ce529f6da4e30a0f740715bc4b9d054c'}\n\n```{.r .cell-code}\nx_latest <- epix_as_of(x, version = max(x$DT$version))\n```\n:::\n\n\nBelow, we pull several snapshots from the archive, spaced one month apart. We\noverlay the corresponding signal curves as colored lines, with the version dates\nmarked by dotted vertical lines, and draw the latest curve in black (from the\nlatest snapshot `x_latest` that the archive can provide).\n\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-7_58040707d267aa6c13723eb08f4789fa'}\n\n```{.r .cell-code}\nself_max <- max(x$DT$version)\nversions <- seq(as.Date(\"2020-06-01\"), self_max - 1, by = \"1 month\")\nsnapshots <- map(\n versions,\n function(v) {\n epix_as_of(x, version = v) %>% mutate(version = v)\n }\n) %>%\n list_rbind() %>%\n bind_rows(x_latest %>% mutate(version = self_max)) %>%\n mutate(latest = version == self_max)\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='archive_cache/html/unnamed-chunk-8_b16b9b5e7b728035f69a078cb0fc40db'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n snapshots %>% filter(!latest),\n aes(x = time_value, y = percent_cli)\n) +\n geom_line(aes(color = factor(version)), na.rm = TRUE) +\n geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n scale_color_viridis_d(option = \"A\", end = .9) +\n labs(x = \"Date\", y = \"% of doctor's visits with CLI\") +\n theme(legend.position = \"none\") +\n geom_line(\n data = snapshots %>% filter(latest),\n aes(x = time_value, y = percent_cli),\n inherit.aes = FALSE, color = \"black\", na.rm = TRUE\n )\n```\n\n::: 
{.cell-output-display}\n![](archive_files/figure-html/unnamed-chunk-8-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nWe can see some interesting and highly nontrivial revision behavior: at some\npoints in time the provisional data snapshots grossly underestimate the latest\ncurve (look in particular at Florida close to the end of 2021), and at others\nthey overestimate it (both states towards the beginning of 2021), though not\nquite as dramatically. Modeling the revision process, which is often called\n*backfill modeling*, is an important statistical problem in and of itself.\n\n## Merging `epi_archive` objects\n\nNow we demonstrate how to merge two `epi_archive` objects together, e.g., so\nthat grabbing data from multiple sources as of a particular version can be\nperformed with a single `as_of` call. The `epiprocess` package provides\n`epix_merge()` for this purpose. Below we merge the working `epi_archive` of\nversioned percentage CLI from outpatient visits to another one of versioned\nCOVID-19 case reporting data, which we fetch from the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html), on the\nrate scale (counts per 100,000 people in the population).\n\nWhen merging archives, unless the archives have identical data release patterns,\n`NA`s can be introduced in the non-key variables for a few reasons:\n- to represent the \"value\" of an observation before its initial release (when we\n need to pair it with additional observations from the other archive that have\n been released)\n- to represent the \"value\" of an observation that has no recorded versions at\n all (in the same sort of situation)\n- if requested via `sync = \"na\"`, to represent potential update data that we do\n not yet have access to (e.g., due to encountering issues while attempting to\n download the currently available version data for one of the archives, but not\n the other).\n\n\n::: {.cell layout-align=\"center\" 
hash='archive_cache/html/unnamed-chunk-9_dd8b9566b93ee92760a9756afba10db6'}\n\n```{.r .cell-code}\n# This code is for illustration and doesn't run.\n# The result is saved/loaded in the (hidden) next chunk from `{epidatasets}`\ny <- pub_covidcast(\n source = \"jhu-csse\",\n signals = \"confirmed_7dav_incidence_prop\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20200601, 20211201),\n geo_values = \"ca,fl,ny,tx\",\n issues = epirange(20200601, 20211201)\n) %>%\n select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%\n as_epi_archive(compactify = TRUE)\n\nx <- epix_merge(x, y, sync = \"locf\", compactify = FALSE)\nprint(x)\nhead(x$DT)\n```\n:::\n\n\n## Sliding version-aware computations\n\n::: {.callout-note}\nTODO: need a simple example here.\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/archive/figure-html/unnamed-chunk-8-1.svg b/_freeze/archive/figure-html/unnamed-chunk-8-1.svg index 9c9e89c..81b91f1 100644 --- a/_freeze/archive/figure-html/unnamed-chunk-8-1.svg +++ b/_freeze/archive/figure-html/unnamed-chunk-8-1.svg @@ -679,7 +679,7 @@ - + diff --git a/_freeze/epidf/execute-results/html.json b/_freeze/epidf/execute-results/html.json index b23b744..b9dd9cc 100644 --- a/_freeze/epidf/execute-results/html.json +++ b/_freeze/epidf/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "ef06ff93fa44c25cdac272e9bf1aeb9b", + "hash": "eac6de9244d86d626bcdbb2945c1fe97", "result": { - "markdown": "# Getting data into epi_df format\n\n\n\n\n\nWe'll start by showing how to get data into \n`epi_df`, which is just\na tibble with a bit of special structure, and is the format assumed by all of\nthe functions in the `epiprocess` package. 
An `epi_df` object has (at least) the\nfollowing columns:\n\n* `geo_value`: the geographic value associated with each row of measurements.\n* `time_value`: the time value associated with each row of measurements.\n\nIt can have any number of other columns which can serve as measured variables,\nwhich we also broadly refer to as signal variables. The documentation for\n gives more details about this data format.\n\nA data frame or tibble that has `geo_value` and `time_value` columns can be\nconverted into an `epi_df` object, using the function `as_epi_df()`. As an\nexample, we'll work with daily cumulative COVID-19 cases from four U.S. states:\nCA, FL, NY, and TX, over time span from mid 2020 to early 2022, and we'll use\nthe [`epidatr`](https://github.com/cmu-delphi/epidatr) package\nto fetch this data from the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html).\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-2_a8b0ce831d237748edcef31c420862a2'}\n\n```{.r .cell-code}\nlibrary(epidatr)\nlibrary(epiprocess)\nlibrary(withr)\n\ncases <- pub_covidcast(\n source = \"jhu-csse\",\n signals = \"confirmed_cumulative_num\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20200301, 20220131),\n geo_values = \"ca,fl,ny,tx\"\n)\n\ncolnames(cases)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"geo_value\" \"signal\" \"source\" \n#> [4] \"geo_type\" \"time_type\" \"time_value\" \n#> [7] \"direction\" \"issue\" \"lag\" \n#> [10] \"missing_value\" \"missing_stderr\" \"missing_sample_size\"\n#> [13] \"value\" \"stderr\" \"sample_size\"\n```\n:::\n:::\n\n\nAs we can see, a data frame returned by `epidatr::covidcast()` has the\ncolumns required for an `epi_df` object (along with many others). 
We can use\n`as_epi_df()`, with specification of some relevant metadata, to bring the data\nframe into `epi_df` format.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-3_634293240d733bec84dd8b6a5c74e634'}\n\n```{.r .cell-code}\nx <- as_epi_df(cases,\n geo_type = \"state\",\n time_type = \"day\",\n as_of = max(cases$issue)\n) %>%\n select(geo_value, time_value, total_cases = value)\n\nclass(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_df\" \"tbl_df\" \"tbl\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nsummary(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` x, with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2023-03-10\n#> ----------\n#> * min time value = 2020-03-01\n#> * max time value = 2022-01-31\n#> * average rows per time value = 4\n```\n:::\n\n```{.r .cell-code}\nhead(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 6 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2023-03-10\n#> \n#> # A tibble: 6 × 3\n#> geo_value time_value total_cases\n#> * \n#> 1 ca 2020-03-01 19\n#> 2 fl 2020-03-01 0\n#> 3 ny 2020-03-01 0\n#> 4 tx 2020-03-01 0\n#> 5 ca 2020-03-02 23\n#> 6 fl 2020-03-02 1\n```\n:::\n\n```{.r .cell-code}\nattributes(x)$metadata\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2023-03-10\"\n```\n:::\n:::\n\n\n## Some details on metadata\n\nIn general, an `epi_df` object has the following fields in its metadata:\n \n* `geo_type`: the type for the geo values.\n* `time_type`: the type for the time values.\n* `as_of`: the time value at which the given data were available.\n\nMetadata for an `epi_df` object `x` can be accessed (and altered) via\n`attributes(x)$metadata`. 
The first two fields here, `geo_type` and `time_type`,\nare not currently used by any downstream functions in the `epiprocess` package,\nand serve only as useful bits of information to convey about the data set at\nhand. The last field here, `as_of`, is one of the most unique aspects of an\n`epi_df` object.\n\nIn brief, we can think of an `epi_df` object as a single snapshot of a data set\nthat contains the most up-to-date values of some signals of interest, as of the\ntime specified `as_of`. For example, if `as_of` is January 31, 2022, then the\n`epi_df` object has the most up-to-date version of the data available as of\nJanuary 31, 2022. The `epiprocess` package also provides a companion data\nstructure called `epi_archive`, which stores the full version history of a given\ndata set. See the [archive\nvignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for\nmore.\n\nIf any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a \ncall to `as_epi_df()`, then this function will try to infer them from the passed\nobject. Usually, `geo_type` and `time_type` can be inferred from the `geo_value`\nand `time_value` columns, respectively, but inferring the `as_of` field is not \nas easy. 
See the documentation for `as_epi_df()` more details.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-4_1c364218e936aa6527bd0675ab37d455'}\n\n```{.r .cell-code}\nx <- as_epi_df(cases) %>%\n select(geo_value, time_value, total_cases = value)\n\nattributes(x)$metadata\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2023-03-10\"\n```\n:::\n:::\n\n\n## Using additional key columns in `epi_df` {#sec-additional-keys}\n\nIn the following examples we will show how to create an `epi_df` with additional keys.\n\n### Converting a `tsibble` that has county code as an extra key\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-5_28361d3ac565b78677e217c86faf03cc'}\n\n```{.r .cell-code}\nset.seed(12345)\nex1 <- tibble(\n geo_value = rep(c(\"ca\", \"fl\", \"pa\"), each = 3),\n county_code = c(\n \"06059\", \"06061\", \"06067\", \"12111\", \"12113\", \"12117\",\n \"42101\", \"42103\", \"42105\"\n ),\n time_value = rep(\n seq(as.Date(\"2020-06-01\"), as.Date(\"2020-06-03\"), by = \"1 day\"),\n length.out = 9\n ),\n value = rpois(9, 5)\n) %>%\n as_tsibble(index = time_value, key = c(geo_value, county_code))\n\nex1 <- as_epi_df(x = ex1, geo_type = \"state\", time_type = \"day\", as_of = \"2020-06-03\")\n```\n:::\n\n\nThe metadata now includes `county_code` as an extra key.\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-6_1c760ce7c25a1f6867568618118bb7ac'}\n\n```{.r .cell-code}\nattr(ex1, \"metadata\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2020-06-03\"\n#> \n#> $other_keys\n#> [1] \"county_code\"\n```\n:::\n:::\n\n\n\n### Dealing with misspecified column names \n\n`epi_df` requires there to be columns `geo_value` and `time_value`, if they do not exist then `as_epi_df()` throws an error.\n\n\n::: 
{.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-7_52307fa1e07fa21173de3e9416897483'}\n\n```{.r .cell-code}\nex2 <- data.frame(\n state = rep(c(\"ca\", \"fl\", \"pa\"), each = 3), # misnamed\n pol = rep(c(\"blue\", \"swing\", \"swing\"), each = 3), # extra key\n reported_date = rep(\n seq(as.Date(\"2020-06-01\"), as.Date(\"2020-06-03\"), by = \"day\"),\n length.out = 9\n ), # misnamed\n value = rpois(9, 5)\n)\nex2 %>% as_epi_df()\n```\n\n::: {.cell-output .cell-output-error}\n```\n#> Error in `Abort()`:\n#> ! `x` must contain a `geo_value` column.\n```\n:::\n:::\n\n\nThe columns should be renamed to match `epi_df` format. \n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-8_eea2403289899a6533606cf4f555d400'}\n\n```{.r .cell-code}\nex2 <- ex2 %>%\n rename(geo_value = state, time_value = reported_date) %>%\n as_epi_df(\n geo_type = \"state\",\n as_of = \"2020-06-03\",\n additional_metadata = list(other_keys = \"pol\")\n )\n\nattr(ex2, \"metadata\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2020-06-03\"\n#> \n#> $other_keys\n#> [1] \"pol\"\n```\n:::\n:::\n\n\n\n### Adding additional keys to an `epi_df` object\n\nIn the above examples, all the keys are added to objects prior to conversion to\n`epi_df` objects. 
But this can also be accomplished afterward.\nWe'll look at an included dataset and filter to a single state for simplicity.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-9_fc0625e5160d2a01eb47d18c346874ed'}\n\n```{.r .cell-code}\nex3 <- jhu_csse_county_level_subset %>%\n filter(time_value > \"2021-12-01\", state_name == \"Massachusetts\") %>%\n slice_tail(n = 6)\n\nattr(ex3, \"metadata\") # geo_type is county currently\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"county\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2022-05-23 14:35:45 PDT\"\n```\n:::\n:::\n\n\nNow we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. The \"state\" `geo_type` anticipates lower-case abbreviations, so we'll match that. \n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-10_fe2c6e15016b44b9220d5fc4f6b51049'}\n\n```{.r .cell-code}\nex3 <- ex3 %>%\n as_tibble() %>% # drop the `epi_df` class before adding additional metadata\n mutate(\n state = rep(tolower(\"MA\"), 6),\n pol = rep(c(\"blue\", \"swing\", \"swing\"), each = 2)\n ) %>%\n as_epi_df(additional_metadata = list(other_keys = c(\"state\", \"pol\")))\n\nattr(ex3, \"metadata\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"county\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2023-12-15 04:50:49 PST\"\n#> \n#> $other_keys\n#> [1] \"state\" \"pol\"\n```\n:::\n:::\n\n\nNote that the two additional keys we added, `state` and `pol`, are specified as a character vector in the `other_keys` component of the `additional_metadata` list. 
They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys.\n\n\n\n## Working with `epi_df` objects downstream\n\nData in `epi_df` format should be easy to work with downstream, since it is a\nvery standard tabular data format; in the other vignettes, we'll walk through\nsome basic signal processing tasks using functions provided in the `epiprocess`\npackage. Of course, we can also write custom code for other downstream uses,\nlike plotting, which is pretty easy to do `ggplot2`.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-11_cf02eb699138d3d8365b66804d295fde'}\n\n```{.r .cell-code}\nggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) +\n geom_line() +\n scale_color_brewer(palette = \"Set1\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Cumulative COVID-19 cases\", color = \"State\")\n```\n\n::: {.cell-output-display}\n![](epidf_files/figure-html/unnamed-chunk-11-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nFinally, we'll examine some data from other packages just to show how \nwe might get them into `epi_df` format. \nThe first is data on daily new (not cumulative) SARS \ncases in Canada in 2003, from the \n[outbreaks](https://github.com/reconverse/outbreaks) package. 
New cases are\nbroken into a few categories by provenance.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-12_f4dc254695766edbb2625b67c42932b7'}\n\n```{.r .cell-code}\nx <- outbreaks::sars_canada_2003 %>%\n mutate(geo_value = \"ca\") %>%\n select(geo_value, time_value = date, starts_with(\"cases\")) %>%\n as_epi_df(geo_type = \"nation\")\n\nhead(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 6 x 6 with metadata:\n#> * geo_type = nation\n#> * time_type = day\n#> * as_of = 2023-12-15 04:50:50\n#> \n#> # A tibble: 6 × 6\n#> geo_value time_value cases_travel cases_household cases_healthcare\n#> * \n#> 1 ca 2003-02-23 1 0 0\n#> 2 ca 2003-02-24 0 0 0\n#> 3 ca 2003-02-25 0 0 0\n#> 4 ca 2003-02-26 0 1 0\n#> 5 ca 2003-02-27 0 0 0\n#> 6 ca 2003-02-28 1 0 0\n#> # ℹ 1 more variable: cases_other \n```\n:::\n:::\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-13_af68bf6df70c76b27435c6f2822266e9'}\n\n```{.r .cell-code code-fold=\"true\"}\nx <- x %>%\n pivot_longer(starts_with(\"cases\"), names_to = \"type\") %>%\n mutate(type = substring(type, 7))\n\nggplot(x, aes(x = time_value, y = value)) +\n geom_col(aes(fill = type), just = 0.5) +\n scale_y_continuous(breaks = 0:4 * 2, expand = expansion(c(0, 0.05))) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"SARS cases in Canada\", fill = \"Type\")\n```\n\n::: {.cell-output-display}\n![](epidf_files/figure-html/unnamed-chunk-13-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nThis next example examines data on new cases of Ebola in Sierra Leone in 2014 (from the same package).\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-14_09c7102254a1a233a78be842fcaf2096'}\n\n```{.r .cell-code}\nx <- outbreaks::ebola_sierraleone_2014 %>%\n mutate(\n cases = ifelse(status == \"confirmed\", 1, 0),\n province = case_when(\n district %in% c(\"Kailahun\", \"Kenema\", \"Kono\") ~ 
\"Eastern\",\n district %in% c(\n \"Bombali\", \"Kambia\", \"Koinadugu\",\n \"Port Loko\", \"Tonkolili\"\n ) ~ \"Northern\",\n district %in% c(\"Bo\", \"Bonthe\", \"Moyamba\", \"Pujehun\") ~ \"Sourthern\",\n district %in% c(\"Western Rural\", \"Western Urban\") ~ \"Western\"\n )\n ) %>%\n select(geo_value = province, time_value = date_of_onset, cases) %>%\n filter(cases == 1) %>%\n group_by(geo_value, time_value) %>%\n summarise(cases = sum(cases)) %>%\n as_epi_df(geo_type = \"province\")\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-15_7b787f995e155e919b8f184101e75f87'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(x, aes(x = time_value, y = cases)) +\n geom_col(aes(fill = geo_value), show.legend = FALSE) +\n facet_wrap(~geo_value, scales = \"free_y\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Confirmed cases of Ebola in Sierra Leone\")\n```\n\n::: {.cell-output-display}\n![](epidf_files/figure-html/unnamed-chunk-15-1.svg){fig-align='center' width=90%}\n:::\n:::\n", + "markdown": "# Getting data into epi_df format\n\n\n\n\n\nWe'll start by showing how to get data into\n`epi_df`, which is just\na tibble with a bit of special structure, and is the format assumed by all of\nthe functions in the `epiprocess` package. An `epi_df` object has (at least) the\nfollowing columns:\n\n* `geo_value`: the geographic value associated with each row of measurements.\n* `time_value`: the time value associated with each row of measurements.\n\nIt can have any number of other columns which can serve as measured variables,\nwhich we also broadly refer to as signal variables. The documentation for\n gives more details about this data format.\n\nA data frame or tibble that has `geo_value` and `time_value` columns can be\nconverted into an `epi_df` object, using the function `as_epi_df()`. As an\nexample, we'll work with daily cumulative COVID-19 cases from four U.S. 
states:\nCA, FL, NY, and TX, over a time span from early 2020 to early 2022, and we'll use\nthe [`epidatr`](https://github.com/cmu-delphi/epidatr) package\nto fetch this data from the [COVIDcast\nAPI](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html).\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-2_a8b0ce831d237748edcef31c420862a2'}\n\n```{.r .cell-code}\nlibrary(epidatr)\nlibrary(epiprocess)\nlibrary(withr)\n\ncases <- pub_covidcast(\n source = \"jhu-csse\",\n signals = \"confirmed_cumulative_num\",\n time_type = \"day\",\n geo_type = \"state\",\n time_values = epirange(20200301, 20220131),\n geo_values = \"ca,fl,ny,tx\"\n)\n\ncolnames(cases)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"geo_value\" \"signal\" \"source\" \n#> [4] \"geo_type\" \"time_type\" \"time_value\" \n#> [7] \"direction\" \"issue\" \"lag\" \n#> [10] \"missing_value\" \"missing_stderr\" \"missing_sample_size\"\n#> [13] \"value\" \"stderr\" \"sample_size\"\n```\n:::\n:::\n\n\nAs we can see, a data frame returned by `epidatr::pub_covidcast()` has the\ncolumns required for an `epi_df` object (along with many others). 
We can use\n`as_epi_df()`, with specification of some relevant metadata, to bring the data\nframe into `epi_df` format.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-3_424072c00e940fde0fbfdef1fb6c56b1'}\n\n```{.r .cell-code}\nx <- as_epi_df(cases, as_of = max(cases$issue)) %>%\n select(geo_value, time_value, total_cases = value)\n\nclass(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> [1] \"epi_df\" \"tbl_df\" \"tbl\" \"data.frame\"\n```\n:::\n\n```{.r .cell-code}\nsummary(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` x, with metadata:\n#> * geo_type = state\n#> * as_of = 2023-03-10\n#> ----------\n#> * min time value = 2020-03-01\n#> * max time value = 2022-01-31\n#> * average rows per time value = 4\n```\n:::\n\n```{.r .cell-code}\nhead(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 6 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2023-03-10\n#> \n#> # A tibble: 6 × 3\n#> geo_value time_value total_cases\n#> * \n#> 1 ca 2020-03-01 19\n#> 2 fl 2020-03-01 0\n#> 3 ny 2020-03-01 0\n#> 4 tx 2020-03-01 0\n#> 5 ca 2020-03-02 23\n#> 6 fl 2020-03-02 1\n```\n:::\n\n```{.r .cell-code}\nattributes(x)$metadata\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2023-03-10\"\n#> \n#> $other_keys\n#> character(0)\n```\n:::\n:::\n\n\n## Some details on metadata\n\nIn general, an `epi_df` object has the following fields in its metadata:\n\n* `geo_type`: the type for the geo values.\n* `time_type`: the type for the time values.\n* `as_of`: the time value at which the given data were available.\n\nMetadata for an `epi_df` object `x` can be accessed (and altered) via\n`attributes(x)$metadata`. 
The first two fields here, `geo_type` and `time_type`,\nare not currently used by any downstream functions in the `epiprocess` package,\nand serve only as useful bits of information to convey about the data set at\nhand. The last field here, `as_of`, is one of the most unique aspects of an\n`epi_df` object.\n\nIn brief, we can think of an `epi_df` object as a single snapshot of a data set\nthat contains the most up-to-date values of some signals of interest, as of the\ntime specified `as_of`. For example, if `as_of` is January 31, 2022, then the\n`epi_df` object has the most up-to-date version of the data available as of\nJanuary 31, 2022. The `epiprocess` package also provides a companion data\nstructure called `epi_archive`, which stores the full version history of a given\ndata set. See the [archive\nvignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for\nmore.\n\nIf any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a\ncall to `as_epi_df()`, then this function will try to infer them from the passed\nobject. Usually, `geo_type` and `time_type` can be inferred from the `geo_value`\nand `time_value` columns, respectively, but inferring the `as_of` field is not\nas easy. 
See the documentation for `as_epi_df()` more details.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-4_1c364218e936aa6527bd0675ab37d455'}\n\n```{.r .cell-code}\nx <- as_epi_df(cases) %>%\n select(geo_value, time_value, total_cases = value)\n\nattributes(x)$metadata\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2023-03-10\"\n#> \n#> $other_keys\n#> character(0)\n```\n:::\n:::\n\n\n## Using additional key columns in `epi_df` {#sec-additional-keys}\n\nIn the following examples we will show how to create an `epi_df` with additional keys.\n\n### Converting a `tsibble` that has county code as an extra key\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-5_13ed9b2e299a25ed17b6425a15addc48'}\n\n```{.r .cell-code}\nset.seed(12345)\nex1 <- tibble(\n geo_value = rep(c(\"ca\", \"fl\", \"pa\"), each = 3),\n county_code = c(\n \"06059\", \"06061\", \"06067\", \"12111\", \"12113\", \"12117\",\n \"42101\", \"42103\", \"42105\"\n ),\n time_value = rep(\n seq(as.Date(\"2020-06-01\"), as.Date(\"2020-06-03\"), by = \"1 day\"),\n length.out = 9\n ),\n value = rpois(9, 5)\n) %>%\n as_tsibble(index = time_value, key = c(geo_value, county_code))\n\nex1 <- as_epi_df(x = ex1, as_of = \"2020-06-03\")\n```\n:::\n\n\nThe metadata now includes `county_code` as an extra key.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-6_1c760ce7c25a1f6867568618118bb7ac'}\n\n```{.r .cell-code}\nattr(ex1, \"metadata\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2020-06-03\"\n#> \n#> $other_keys\n#> [1] \"county_code\"\n```\n:::\n:::\n\n\n\n### Dealing with misspecified column names\n\n`epi_df` requires there to be columns `geo_value` and `time_value`, if they do not exist then `as_epi_df()` throws an error.\n\n\n::: {.cell 
layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-7_ed6a7fdb22f3e76485027e927056c297'}\n\n```{.r .cell-code}\nex2 <- data.frame(\n state = rep(c(\"ca\", \"fl\", \"pa\"), each = 3), # misnamed\n pol = rep(c(\"blue\", \"swing\", \"swing\"), each = 3), # extra key\n reported_date = rep(\n seq(as.Date(\"2020-06-01\"), as.Date(\"2020-06-03\"), by = \"day\"),\n length.out = 9\n ), # misnamed\n value = rpois(9, 5)\n)\nex2 %>% as_epi_df()\n```\n\n::: {.cell-output .cell-output-error}\n```\n#> Error in `guess_column_name()` at epiprocess/R/epi_df.R:233:3:\n#> ! There is no time_value column or similar name. See e.g.\n#> [`time_column_name()`] for a complete list\n```\n:::\n:::\n\n\nThe columns should be renamed to match `epi_df` format.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-8_55da8bb70c4f752a59c17e051f950ce5'}\n\n```{.r .cell-code}\nex2 <- ex2 %>%\n rename(geo_value = state, time_value = reported_date) %>%\n as_epi_df(\n as_of = \"2020-06-03\",\n other_keys = \"pol\"\n )\n\nattr(ex2, \"metadata\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"state\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2020-06-03\"\n#> \n#> $other_keys\n#> [1] \"pol\"\n```\n:::\n:::\n\n\n### Adding additional keys to an `epi_df` object\n\nIn the above examples, all the keys are added to objects prior to conversion to\n`epi_df` objects. 
But this can also be accomplished afterward.\nWe'll look at an included dataset and filter to a single state for simplicity.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-9_c65c3cc6cc393f7a99385702f28b05ad'}\n\n```{.r .cell-code}\nex3 <- jhu_csse_county_level_subset %>%\n filter(time_value > \"2021-12-01\", state_name == \"Massachusetts\") %>%\n slice_tail(n = 6)\n\nattr(ex3, \"metadata\") # geo_type is county currently\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"county\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2024-08-22 19:40:32 PDT\"\n#> \n#> $other_keys\n#> character(0)\n```\n:::\n:::\n\n\nNow we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. The \"state\" `geo_type` anticipates lower-case abbreviations, so we'll match that.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-10_6335a687a09e5e296e78d4e2c39b281e'}\n\n```{.r .cell-code}\nex3 <- ex3 %>%\n as_tibble() %>% # drop the `epi_df` class before adding additional metadata\n mutate(\n state = rep(tolower(\"MA\"), 6),\n pol = rep(c(\"blue\", \"swing\", \"swing\"), each = 2)\n ) %>%\n as_epi_df(other_keys = c(\"state\", \"pol\"))\n\nattr(ex3, \"metadata\")\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> $geo_type\n#> [1] \"county\"\n#> \n#> $time_type\n#> [1] \"day\"\n#> \n#> $as_of\n#> [1] \"2024-09-30 16:41:57 PDT\"\n#> \n#> $other_keys\n#> [1] \"state\" \"pol\"\n```\n:::\n:::\n\n\nNote that the two additional keys we added, `state` and `pol`, are specified as a character vector in the `other_keys` component of the `additional_metadata` list. 
They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys.\n\n\n\n## Working with `epi_df` objects downstream\n\nData in `epi_df` format should be easy to work with downstream, since it is a\nvery standard tabular data format; in the other vignettes, we'll walk through\nsome basic signal processing tasks using functions provided in the `epiprocess`\npackage. Of course, we can also write custom code for other downstream uses,\nlike plotting, which is pretty easy to do with `ggplot2`.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-11_355de57e64e10837f1f429b4603f237d'}\n\n```{.r .cell-code}\nggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) +\n geom_line() +\n scale_color_brewer(palette = \"Set1\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Cumulative COVID-19 cases\", color = \"State\")\n```\n\n::: {.cell-output-display}\n![](epidf_files/figure-html/unnamed-chunk-11-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nFinally, we'll examine some data from other packages just to show how\nwe might get them into `epi_df` format.\nThe first is data on daily new (not cumulative) SARS\ncases in Canada in 2003, from the\n[outbreaks](https://github.com/reconverse/outbreaks) package. 
New cases are\nbroken into a few categories by provenance.\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-12_45ec873c79b7f5c7b5732235ee7e34ff'}\n\n```{.r .cell-code}\nx <- outbreaks::sars_canada_2003 %>%\n mutate(geo_value = \"ca\") %>%\n select(geo_value, time_value = date, starts_with(\"cases\")) %>%\n as_epi_df()\n\nhead(x)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 6 x 6 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2024-09-30 16:41:57.652717\n#> \n#> # A tibble: 6 × 6\n#> geo_value time_value cases_travel cases_household cases_healthcare\n#> * \n#> 1 ca 2003-02-23 1 0 0\n#> 2 ca 2003-02-24 0 0 0\n#> 3 ca 2003-02-25 0 0 0\n#> 4 ca 2003-02-26 0 1 0\n#> 5 ca 2003-02-27 0 0 0\n#> 6 ca 2003-02-28 1 0 0\n#> # ℹ 1 more variable: cases_other \n```\n:::\n:::\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-13_dfc91abe2d4b1af7653920626ab2e6d8'}\n\n```{.r .cell-code code-fold=\"true\"}\nx <- x %>%\n pivot_longer(starts_with(\"cases\"), names_to = \"type\") %>%\n mutate(type = substring(type, 7))\n\nggplot(x, aes(x = time_value, y = value)) +\n geom_col(aes(fill = type), just = 0.5) +\n scale_y_continuous(breaks = 0:4 * 2, expand = expansion(c(0, 0.05))) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"SARS cases in Canada\", fill = \"Type\")\n```\n\n::: {.cell-output-display}\n![](epidf_files/figure-html/unnamed-chunk-13-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nThis next example examines data on new cases of Ebola in Sierra Leone in 2014 (from the same package).\n\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-14_ed7ca3d4b9c9eebb3ec4511a1f697a14'}\n\n```{.r .cell-code}\nx <- outbreaks::ebola_sierraleone_2014 %>%\n mutate(\n cases = ifelse(status == \"confirmed\", 1, 0),\n province = case_when(\n district %in% c(\"Kailahun\", \"Kenema\", \"Kono\") ~ \"Eastern\",\n 
district %in% c(\n \"Bombali\", \"Kambia\", \"Koinadugu\",\n \"Port Loko\", \"Tonkolili\"\n ) ~ \"Northern\",\n district %in% c(\"Bo\", \"Bonthe\", \"Moyamba\", \"Pujehun\") ~ \"Sourthern\",\n district %in% c(\"Western Rural\", \"Western Urban\") ~ \"Western\"\n )\n ) %>%\n select(geo_value = province, time_value = date_of_onset, cases) %>%\n filter(cases == 1) %>%\n group_by(geo_value, time_value) %>%\n summarise(cases = sum(cases)) %>%\n as_epi_df()\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='epidf_cache/html/unnamed-chunk-15_43837d6f58c4eaea8f6160ffffe59d5b'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(x, aes(x = time_value, y = cases)) +\n geom_col(aes(fill = geo_value), show.legend = FALSE) +\n facet_wrap(~geo_value, scales = \"free_y\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Confirmed cases of Ebola in Sierra Leone\")\n```\n\n::: {.cell-output-display}\n![](epidf_files/figure-html/unnamed-chunk-15-1.svg){fig-align='center' width=90%}\n:::\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/epidf/figure-html/unnamed-chunk-11-1.svg b/_freeze/epidf/figure-html/unnamed-chunk-11-1.svg index dea577d..6619e3f 100644 --- a/_freeze/epidf/figure-html/unnamed-chunk-11-1.svg +++ b/_freeze/epidf/figure-html/unnamed-chunk-11-1.svgdiff --git a/_freeze/epidf/figure-html/unnamed-chunk-13-1.svg b/_freeze/epidf/figure-html/unnamed-chunk-13-1.svg index 86623eb..0ceab63 100644 --- a/_freeze/epidf/figure-html/unnamed-chunk-13-1.svg +++ b/_freeze/epidf/figure-html/unnamed-chunk-13-1.svgdiff --git a/_freeze/epidf/figure-html/unnamed-chunk-15-1.svg b/_freeze/epidf/figure-html/unnamed-chunk-15-1.svg index c196909..7193232 100644 --- a/_freeze/epidf/figure-html/unnamed-chunk-15-1.svg +++ b/_freeze/epidf/figure-html/unnamed-chunk-15-1.svgdiff --git a/_freeze/outliers/execute-results/html.json b/_freeze/outliers/execute-results/html.json index bcf951b..f8c6338 100644 --- 
a/_freeze/outliers/execute-results/html.json +++ b/_freeze/outliers/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "2c87fd6b2160a0e6e82f132ff958832d", + "hash": "5bc615a2bdf00f9db22f9d3adb7a86e2", "result": { - "markdown": "# Detect and correct outliers in signals\n\nThis chapter describes functionality for detecting and correcting outliers in\nsignals in the `detect_outlr()` and `correct_outlr()` functions provided in the\n`epiprocess` package. These functions is designed to be modular and extendable,\nso that you can define your own outlier detection and correction routines and\napply them to `epi_df` objects. We'll demonstrate this using state-level daily\nreported COVID-19 case counts from FL and NJ.\n\n\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-2_a04e38e37e2a0cee4145786b428621e0'}\n\n```{.r .cell-code}\nx <- incidence_num_outlier_example\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-3_eeb1c583efb1d858ceb57a9c288edf2e'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(x, aes(x = time_value, y = cases, color = geo_value)) +\n geom_line() +\n scale_color_manual(values = c(3, 6)) +\n geom_hline(yintercept = 0, linetype = 3) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Reported COVID-19 counts\")\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-3-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nThere are multiple outliers in these data that a modeler may want to detect and\ncorrect. We'll discuss those two tasks in turn.\n\n## Outlier detection\n\nThe `detect_outlr()` function allows us to run multiple outlier detection\nmethods on a given signal, and then (optionally) combine the results from those\nmethods. Here, we'll investigate outlier detection results from the following\nmethods.\n\n1. 
Detection based on a rolling median, using `detect_outlr_rm()`, which \n computes a rolling median on with a default window size of `n` time points \n centered at the time point under consideration, and then computes thresholds \n based on a multiplier times a rolling IQR computed on the residuals. \n2. Detection based on a seasonal-trend decomposition using LOESS (STL), using\n `detect_outlr_stl()`, which is similar to the rolling median method but \n replaces the rolling median with fitted values from STL. \n3. Detection based on an STL decomposition, but without seasonality term, which\n amounts to smoothing using LOESS.\n\nThe outlier detection methods are specified using a `tibble` that is passed to\n`detect_outlr()`, with one row per method, and whose columms specify the\noutlier detection function, any input arguments (only nondefault values need to\nbe supplied), and an abbreviated name for the method used in tracking results.\nAbbreviations \"rm\" and \"stl\" can be used for the built-in detection functions \n`detect_outlr_rm()` and `detect_outlr_stl()`, respectively.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-4_d718f9b3ce1f2b62cbbefda0c73956dc'}\n\n```{.r .cell-code}\ndetection_methods <- bind_rows(\n tibble(\n method = \"rm\",\n args = list(list(\n detect_negatives = TRUE,\n detection_multiplier = 2.5\n )),\n abbr = \"rm\"\n ),\n tibble(\n method = \"stl\",\n args = list(list(\n detect_negatives = TRUE,\n detection_multiplier = 2.5,\n seasonal_period = 7\n )),\n abbr = \"stl_seasonal\"\n ),\n tibble(\n method = \"stl\",\n args = list(list(\n detect_negatives = TRUE,\n detection_multiplier = 2.5,\n seasonal_period = NULL\n )),\n abbr = \"stl_nonseasonal\"\n )\n)\n\ndetection_methods\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 3 × 3\n#> method args abbr \n#> \n#> 1 rm rm \n#> 2 stl stl_seasonal \n#> 3 stl stl_nonseasonal\n```\n:::\n:::\n\n\nAdditionally, we'll form combined lower and upper 
thresholds, calculated as the\nmedian of the lower and upper thresholds from the methods at each time point.\nNote that using this combined median threshold is equivalent to using a majority\nvote across the base methods to determine whether a value is an outlier.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-5_8b0c1909c0789a5ed4ad41dc03bdbcc0'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n mutate(\n outlier_info = detect_outlr(\n x = time_value, y = cases,\n methods = detection_methods,\n combiner = \"median\"\n )\n ) %>%\n ungroup() %>%\n unnest(outlier_info)\n\nx\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 730 x 15 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-21 15:17:14\n#> \n#> # A tibble: 730 × 15\n#> geo_value time_value cases rm_lower rm_upper rm_replacement\n#> * \n#> 1 fl 2020-06-01 667 345 2195 667\n#> 2 nj 2020-06-01 486 64.4 926. 486\n#> 3 fl 2020-06-02 617 406. 2169. 617\n#> 4 nj 2020-06-02 658 140. 841. 658\n#> 5 fl 2020-06-03 1317 468. 2142. 
1317\n#> 6 nj 2020-06-03 541 216 756 541\n#> # ℹ 724 more rows\n#> # ℹ 9 more variables: stl_seasonal_lower , stl_seasonal_upper , …\n```\n:::\n:::\n\n\nTo visualize the results, we define a convenience function for and call it on \neach state separately (hidden below the fold).\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-6_b18b51621bf6de1276da27cedba0106c'}\n\n```{.r .cell-code code-fold=\"true\"}\n# Plot outlier detection bands and/or points identified as outliers\nplot_outlr <- function(\n x, signal, method_abbr, bands = TRUE, points = TRUE,\n facet_vars = vars(geo_value), nrow = NULL, ncol = NULL,\n scales = \"fixed\") {\n # Convert outlier detection results to long format\n signal <- rlang::enquo(signal)\n x_long <- x %>%\n pivot_longer(\n cols = starts_with(method_abbr),\n names_to = c(\"method\", \".value\"),\n names_pattern = \"(.+)_(.+)\"\n )\n\n # Start of plot with observed data\n p <- ggplot() +\n geom_line(data = x, mapping = aes(x = time_value, y = !!signal))\n\n # If requested, add bands\n if (bands) {\n p <- p + geom_ribbon(\n data = x_long,\n aes(\n x = time_value, ymin = lower, ymax = upper,\n color = method\n ), fill = NA\n )\n }\n\n # If requested, add points\n if (points) {\n x_detected <- x_long %>% filter((!!signal < lower) | (!!signal > upper))\n p <- p + geom_point(\n data = x_detected,\n aes(\n x = time_value, y = !!signal, color = method,\n shape = method\n )\n )\n }\n\n # If requested, add faceting\n if (!is.null(facet_vars)) {\n p <- p + facet_wrap(facet_vars, nrow = nrow, ncol = ncol, scales = scales)\n }\n\n return(p)\n}\n```\n:::\n\n\nNow we produce plots for each state at a time, faceting by the detection method.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-7_5405f07bf3cd51cfe2ca67bccef65fe0'}\n\n```{.r .cell-code code-fold=\"true\"}\nmethod_abbr <- c(detection_methods$abbr, \"combined\")\n\nplot_outlr(x %>% filter(geo_value == \"fl\"), cases, method_abbr,\n 
facet_vars = vars(method), scales = \"free_y\", ncol = 2\n) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(\n x = \"Date\", y = \"Reported COVID-19 counts\", color = \"Method\",\n shape = \"Method\"\n ) +\n scale_color_brewer(palette = \"Set1\") +\n ggtitle(\"Florida\") +\n theme(legend.position = \"bottom\")\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-7-1.svg){fig-align='center' width=90%}\n:::\n\n```{.r .cell-code code-fold=\"true\"}\nplot_outlr(x %>% filter(geo_value == \"nj\"), cases, method_abbr,\n facet_vars = vars(method), scales = \"free_y\", ncol = 2\n) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(\n x = \"Date\", y = \"Reported COVID-19 counts\", color = \"Method\",\n shape = \"Method\"\n ) +\n scale_color_brewer(palette = \"Set1\") +\n ggtitle(\"New Jersey\") +\n theme(legend.position = \"bottom\")\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-7-2.svg){fig-align='center' width=90%}\n:::\n:::\n\n\n## Outlier correction\n\nFinally, in order to correct outliers, we can use the posited replacement values\nreturned by each outlier detection method. 
Below we use the replacement value\nfrom the combined method, which is defined by the median of replacement values \nfrom the base methods at each time point.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-8_747bff7bae49f5f0304632fd3b1558a9'}\n\n```{.r .cell-code}\ny <- x %>%\n mutate(cases_corrected = combined_replacement) %>%\n select(geo_value, time_value, cases, cases_corrected)\n\ny %>% filter(cases != cases_corrected)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 22 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-21 15:17:14\n#> \n#> # A tibble: 22 × 4\n#> geo_value time_value cases cases_corrected\n#> * \n#> 1 fl 2020-07-12 15300 10181 \n#> 2 nj 2020-07-19 -8 320.\n#> 3 nj 2020-08-13 694 404.\n#> 4 nj 2020-08-14 619 397.\n#> 5 nj 2020-08-16 40 366 \n#> 6 nj 2020-08-22 555 360 \n#> # ℹ 16 more rows\n```\n:::\n:::\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-9_d88f9e8692dcd1d4f7b70de883a83a80'}\n\n```{.r .cell-code code-fold=\"true\"}\ny %>%\n pivot_longer(starts_with(\"cases\")) %>%\n ggplot(aes(x = time_value)) +\n geom_line(aes(y = value, color = name, linetype = name)) +\n scale_color_brewer(palette = \"Set1\") +\n scale_linetype_manual(values = c(2, 1)) +\n geom_hline(yintercept = 0) +\n facet_wrap(vars(geo_value), scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Reported COVID-19 counts\") +\n theme(legend.position = \"bottom\", legend.title = element_blank())\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-9-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nMore advanced correction functionality will be coming at some point in the \nfuture. 
\n\n", + "markdown": "# Detect and correct outliers in signals\n\nThis chapter describes functionality for detecting and correcting outliers in\nsignals in the `detect_outlr()` and `correct_outlr()` functions provided in the\n`epiprocess` package. These functions are designed to be modular and extendable,\nso that you can define your own outlier detection and correction routines and\napply them to `epi_df` objects. We'll demonstrate this using state-level daily\nreported COVID-19 case counts from FL and NJ.\n\n\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-2_fce8088b6b75a2935c681ab7585edf92'}\n\n```{.r .cell-code}\nincidence_num_outlier_example\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 730 x 3 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2024-08-22 19:40:18.860591\n#> \n#> # A tibble: 730 × 3\n#> geo_value time_value cases\n#> * \n#> 1 fl 2020-06-01 667\n#> 2 nj 2020-06-01 486\n#> 3 fl 2020-06-02 617\n#> 4 nj 2020-06-02 658\n#> 5 fl 2020-06-03 1317\n#> 6 nj 2020-06-03 541\n#> # ℹ 724 more rows\n```\n:::\n:::\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-3_c1719525e7d4141c953ea1f5919394b6'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(incidence_num_outlier_example, aes(x = time_value, y = cases, color = geo_value)) +\n geom_line() +\n scale_color_manual(values = c(3, 6)) +\n geom_hline(yintercept = 0, linetype = 3) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Reported COVID-19 counts\")\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-3-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nThere are multiple outliers in these data that a modeler may want to detect and\ncorrect. 
We'll discuss those two tasks in turn.\n\n## Outlier detection\n\nThe `detect_outlr()` function allows us to run multiple outlier detection\nmethods on a given signal, and then (optionally) combine the results from those\nmethods. Here, we'll investigate outlier detection results from the following\nmethods.\n\n1. Detection based on a rolling median, using `detect_outlr_rm()`, which\n computes a rolling median with a default window size of `n` time points\n centered at the time point under consideration, and then computes thresholds\n based on a multiplier times a rolling IQR computed on the residuals.\n2. Detection based on a seasonal-trend decomposition using LOESS (STL), using\n `detect_outlr_stl()`, which is similar to the rolling median method but\n replaces the rolling median with fitted values from STL.\n3. Detection based on an STL decomposition, but without a seasonality term, which\n amounts to smoothing using LOESS.\n\nThe outlier detection methods are specified using a `tibble` that is passed to\n`detect_outlr()`, with one row per method, and whose columns specify the\noutlier detection function, any input arguments (only nondefault values need to\nbe supplied), and an abbreviated name for the method used in tracking results.\nAbbreviations \"rm\" and \"stl\" can be used for the built-in detection functions\n`detect_outlr_rm()` and `detect_outlr_stl()`, respectively.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-4_04ab87a0055f799565c3334dbeab9ff6'}\n\n```{.r .cell-code}\ndetection_methods <- bind_rows(\n tibble(\n method = \"rm\",\n args = list(list(\n detect_negatives = TRUE,\n detection_multiplier = 2.5\n )),\n abbr = \"rm\"\n ),\n tibble(\n method = \"stl\",\n args = list(list(\n detect_negatives = TRUE,\n detection_multiplier = 2.5,\n seasonal_period = 7\n )),\n abbr = \"stl_seasonal\"\n )\n)\n\ndetection_methods\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 2 × 3\n#> method args abbr \n#> 
\n#> 1 rm rm \n#> 2 stl stl_seasonal\n```\n:::\n:::\n\n\nAdditionally, we'll form combined lower and upper thresholds, calculated as the\nmedian of the lower and upper thresholds from the methods at each time point.\nNote that using this combined median threshold is equivalent to using a majority\nvote across the base methods to determine whether a value is an outlier.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-5_ad8a07e4e67cba15a5f0d0cff3d14cb7'}\n\n```{.r .cell-code}\nx <- incidence_num_outlier_example %>%\n group_by(geo_value) %>%\n mutate(\n outlier_info = detect_outlr(\n x = time_value, y = cases,\n methods = detection_methods,\n combiner = \"median\"\n )\n ) %>%\n unpack(outlier_info) %>%\n ungroup()\n\nx\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 730 × 14\n#> geo_value time_value cases rm_geo_value rm_lower rm_upper rm_replacement\n#> \n#> 1 fl 2020-06-01 667 0 345 2195 667\n#> 2 nj 2020-06-01 486 0 64.4 926. 486\n#> 3 fl 2020-06-02 617 0 406. 2169. 617\n#> 4 nj 2020-06-02 658 0 140. 841. 658\n#> 5 fl 2020-06-03 1317 0 468. 2142. 
1317\n#> 6 nj 2020-06-03 541 0 216 756 541\n#> # ℹ 724 more rows\n#> # ℹ 7 more variables: stl_seasonal_geo_value , …\n```\n:::\n:::\n\n\nTo visualize the results, we define a convenience function and call it on\neach state separately (hidden below the fold).\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-6_a45482fb876b98553f0f4e0046624c6a'}\n\n```{.r .cell-code code-fold=\"true\"}\n# Plot outlier detection bands and/or points identified as outliers\nplot_outlr <- function(\n x, signal, method_abbr, bands = TRUE, points = TRUE,\n facet_vars = vars(geo_value), nrow = NULL, ncol = NULL,\n scales = \"fixed\") {\n # Convert outlier detection results to long format\n signal <- rlang::enquo(signal)\n x_long <- x %>%\n pivot_longer(\n cols = starts_with(method_abbr),\n names_to = c(\"method\", \".value\"),\n names_pattern = \"(.+)_(.+)\"\n )\n\n # Start of plot with observed data\n p <- ggplot() +\n geom_line(data = x, mapping = aes(x = time_value, y = !!signal))\n\n # If requested, add bands\n if (bands) {\n p <- p + geom_ribbon(\n data = x_long,\n aes(\n x = time_value, ymin = lower, ymax = upper,\n color = method\n ), fill = NA\n )\n }\n\n # If requested, add points\n if (points) {\n x_detected <- x_long %>% filter((!!signal < lower) | (!!signal > upper))\n p <- p + geom_point(\n data = x_detected,\n aes(\n x = time_value, y = !!signal, color = method,\n shape = method\n )\n )\n }\n\n # If requested, add faceting\n if (!is.null(facet_vars)) {\n p <- p + facet_wrap(facet_vars, nrow = nrow, ncol = ncol, scales = scales)\n }\n\n return(p)\n}\n```\n:::\n:::\n\n\nNow we produce plots for one state at a time, faceting by the detection method.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-7_6e64854f9f9a422f95e2e6614fec9c3b'}\n\n```{.r .cell-code code-fold=\"true\"}\nmethod_abbr <- c(detection_methods$abbr, \"combined\")\n\nplot_outlr(x %>% filter(geo_value == \"fl\"), cases, method_abbr,\n facet_vars = 
vars(method), scales = \"free_y\", ncol = 2\n) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(\n x = \"Date\", y = \"Reported COVID-19 counts\", color = \"Method\",\n shape = \"Method\"\n ) +\n scale_color_brewer(palette = \"Set1\") +\n ggtitle(\"Florida\") +\n theme(legend.position = \"bottom\")\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-7-1.svg){fig-align='center' width=90%}\n:::\n\n```{.r .cell-code code-fold=\"true\"}\nplot_outlr(x %>% filter(geo_value == \"nj\"), cases, method_abbr,\n facet_vars = vars(method), scales = \"free_y\", ncol = 2\n) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(\n x = \"Date\", y = \"Reported COVID-19 counts\", color = \"Method\",\n shape = \"Method\"\n ) +\n scale_color_brewer(palette = \"Set1\") +\n ggtitle(\"New Jersey\") +\n theme(legend.position = \"bottom\")\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-7-2.svg){fig-align='center' width=90%}\n:::\n:::\n\n\n## Outlier correction\n\nFinally, in order to correct outliers, we can use the posited replacement values\nreturned by each outlier detection method. 
Below we use the replacement value\nfrom the combined method, which is defined by the median of replacement values\nfrom the base methods at each time point.\n\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-8_dbd71109e1d8e19afa0883a5b3087f24'}\n\n```{.r .cell-code}\ny <- x %>%\n mutate(cases_corrected = combined_replacement) %>%\n select(geo_value, time_value, cases, cases_corrected)\n\ny %>% filter(cases != cases_corrected)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 34 × 4\n#> geo_value time_value cases cases_corrected\n#> \n#> 1 fl 2020-07-12 15300 10181 \n#> 2 nj 2020-07-19 -8 249.\n#> 3 nj 2020-07-31 748 405 \n#> 4 fl 2020-08-12 8109 5803.\n#> 5 nj 2020-08-13 694 381 \n#> 6 nj 2020-08-14 619 381 \n#> # ℹ 28 more rows\n```\n:::\n:::\n\n::: {.cell layout-align=\"center\" hash='outliers_cache/html/unnamed-chunk-9_d88f9e8692dcd1d4f7b70de883a83a80'}\n\n```{.r .cell-code code-fold=\"true\"}\ny %>%\n pivot_longer(starts_with(\"cases\")) %>%\n ggplot(aes(x = time_value)) +\n geom_line(aes(y = value, color = name, linetype = name)) +\n scale_color_brewer(palette = \"Set1\") +\n scale_linetype_manual(values = c(2, 1)) +\n geom_hline(yintercept = 0) +\n facet_wrap(vars(geo_value), scales = \"free_y\", ncol = 1) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Reported COVID-19 counts\") +\n theme(legend.position = \"bottom\", legend.title = element_blank())\n```\n\n::: {.cell-output-display}\n![](outliers_files/figure-html/unnamed-chunk-9-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nMore advanced correction functionality will be coming at some point in the\nfuture.\n\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/outliers/figure-html/unnamed-chunk-3-1.svg b/_freeze/outliers/figure-html/unnamed-chunk-3-1.svg index c6617b1..be28060 100644 --- a/_freeze/outliers/figure-html/unnamed-chunk-3-1.svg +++ 
b/_freeze/outliers/figure-html/unnamed-chunk-3-1.svg @@ -1,606 +1,613 @@ - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - + + - - - - + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + + - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/_freeze/outliers/figure-html/unnamed-chunk-7-1.svg b/_freeze/outliers/figure-html/unnamed-chunk-7-1.svg index 32ce016..e805ef5 100644 --- a/_freeze/outliers/figure-html/unnamed-chunk-7-1.svg +++ b/_freeze/outliers/figure-html/unnamed-chunk-7-1.svgdiff --git a/_freeze/outliers/figure-html/unnamed-chunk-7-2.svg b/_freeze/outliers/figure-html/unnamed-chunk-7-2.svg index d2e3de1..ed9e9a6 100644 --- a/_freeze/outliers/figure-html/unnamed-chunk-7-2.svg +++ b/_freeze/outliers/figure-html/unnamed-chunk-7-2.svgdiff --git a/_freeze/outliers/figure-html/unnamed-chunk-9-1.svg b/_freeze/outliers/figure-html/unnamed-chunk-9-1.svg index c42bd19..1d4053a 100644 --- a/_freeze/outliers/figure-html/unnamed-chunk-9-1.svg +++ b/_freeze/outliers/figure-html/unnamed-chunk-9-1.svgdiff --git a/_freeze/slide/execute-results/html.json b/_freeze/slide/execute-results/html.json index 70b9578..cef2d48 100644 --- a/_freeze/slide/execute-results/html.json +++ b/_freeze/slide/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "74d3db577831e2a6d86c6a6c2939a7d3", + "hash": "1c6c5e6e0f48d1e2eba243ce492d70c0", "result": { - "markdown": "# Sliding computations {#sec-sliding}\n\nA central tool in the `{epiprocess}` package is `epi_slide()`, which is based\non the powerful functionality provided in the \n[`slider`](https://cran.r-project.org/web/packages/slider) package. In\n`{epiprocess}`, to \"slide\" means to apply a computation---represented as a\nfunction or formula---over a sliding/rolling data window. Suitable\ngroupings can always be achieved by a preliminary call to `group_by()`.\n\nBy default, the meaning of one time step is inferred from the `time_value`\ncolumn of the `epi_df` object under consideration, based on the way this column\nunderstands addition and subtraction. 
For example, if the time values are coded\nas `Date` objects, then one time step is one day, since \n`as.Date(\"2022-01-01\") + 1` equals `as.Date(\"2022-01-02\")`. Alternatively, the time step can be specified\nmanually in the call to `epi_slide()`; you can read the documentation for more\ndetails. Furthermore, the alignment of the running window used in `epi_slide()`\ncan be \"right\", \"center\", or \"left\"; the default is \"right\", and is what we use\nin this vignette.\n\nAs in getting started guide, we'll fetch daily reported COVID-19 cases from CA,\nFL, NY, and TX (note: here we're using new, not cumulative cases) using the\n[`epidatr`](https://github.com/cmu-delphi/epidatr) package,\nand then convert this to `epi_df` format.\n\n\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-2_feb3ab09af2a656b7552aabd4fb92768'}\n\n```{.r .cell-code}\nlibrary(epidatr)\nlibrary(epiprocess)\nlibrary(epipredict)\n```\n:::\n\n\nThe example data we'll use is part of the package and has 2,684 rows and 3 columns.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-3_de5ebab547ecc5d1e32e4f6b65aac60b'}\n\n```{.r .cell-code}\ndata(jhu_csse_daily_subset)\nx <- jhu_csse_daily_subset %>%\n select(geo_value, time_value, cases) %>%\n arrange(geo_value, time_value) %>%\n as_epi_df()\n```\n:::\n\n\n\n## Slide with a formula\n\nWe first demonstrate how to apply a 7-day trailing average to the daily cases in\norder to smooth the signal, by passing in a formula for the first argument of\n`epi_slide()`. 
To do this computation per state, we first call `group_by()`.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-4_13b28969f566d77bd0c5e1e88a551491'}\n\n```{.r .cell-code}\nx %>%\n group_by(geo_value) %>%\n epi_slide(~ mean(.x$cases), before = 6) %>%\n ungroup()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 4,026 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-23 13:17:07\n#> \n#> # A tibble: 4,026 × 4\n#> geo_value time_value cases slide_value\n#> * \n#> 1 ca 2020-03-01 6 6 \n#> 2 ca 2020-03-02 4 5 \n#> 3 ca 2020-03-03 6 5.33\n#> 4 ca 2020-03-04 11 6.75\n#> 5 ca 2020-03-05 10 7.4 \n#> 6 ca 2020-03-06 18 9.17\n#> # ℹ 4,020 more rows\n```\n:::\n:::\n\n\nThe formula specified has access to all non-grouping columns present in the\noriginal `epi_df` object (and must refer to them with the prefix `.x$`). As we\ncan see, the function `epi_slide()` returns an `epi_df` object with a new column\nappended that contains the results (from sliding), named `slide_value` as the\ndefault. 
We can of course change this post hoc, or we can instead specify a new\nname up front using the `new_col_name` argument:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-5_cf02a2675d6bbdf3eb316e16406a82e5'}\n\n```{.r .cell-code}\nx %>%\n group_by(geo_value) %>%\n epi_slide(~ mean(.x$cases), before = 6, new_col_name = \"cases_7dav\") %>%\n ungroup()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 4,026 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-23 13:17:07\n#> \n#> # A tibble: 4,026 × 4\n#> geo_value time_value cases cases_7dav\n#> * \n#> 1 ca 2020-03-01 6 6 \n#> 2 ca 2020-03-02 4 5 \n#> 3 ca 2020-03-03 6 5.33\n#> 4 ca 2020-03-04 11 6.75\n#> 5 ca 2020-03-05 10 7.4 \n#> 6 ca 2020-03-06 18 9.17\n#> # ℹ 4,020 more rows\n```\n:::\n:::\n\n\nSome other information is available in additional variables:\n\n* `.group_key` is a one-row tibble containing the values of the grouping\n variables for the associated group\n* `.ref_time_value` is the reference time value the time window was based on\n\nLike in `group_modify()`, there are alternative names for these variables as\nwell: `.` can be used instead of `.x`, `.y` instead of `.group_key`, and `.z`\ninstead of `.ref_time_value`.\n\n## Slide with a function \n\nWe can also pass a function for the first argument in `epi_slide()`. 
In this\ncase, the passed function must accept the following arguments:\n\nIn this case, the passed function `f` must accept the following arguments: a\ndata frame with the same column names as the original object, minus any grouping\nvariables, containing the time window data for one group-`ref_time_value`\ncombination; followed by a one-row tibble containing the values of the grouping\nvariables for the associated group; followed by the associated `ref_time_value`.\nIt can accept additional arguments; `epi_slide()` will forward any `...` args it\nreceives to `f`.\n\nRecreating the last example of a 7-day trailing average:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-6_63c4174606b3c7249ee9ddd5f3171d78'}\n\n```{.r .cell-code}\nx %>%\n group_by(geo_value) %>%\n epi_slide(function(x, gk, rtv) mean(x$cases),\n before = 6, new_col_name = \"cases_7dav\"\n ) %>%\n ungroup()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 4,026 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2022-05-23 13:17:07\n#> \n#> # A tibble: 4,026 × 4\n#> geo_value time_value cases cases_7dav\n#> * \n#> 1 ca 2020-03-01 6 6 \n#> 2 ca 2020-03-02 4 5 \n#> 3 ca 2020-03-03 6 5.33\n#> 4 ca 2020-03-04 11 6.75\n#> 5 ca 2020-03-05 10 7.4 \n#> 6 ca 2020-03-06 18 9.17\n#> # ℹ 4,020 more rows\n```\n:::\n:::\n\n\n## Slide the tidy way\n\nPerhaps the most convenient way to setup a computation in `epi_slide()` is to\npass in an expression for tidy evaluation. In this case, we can simply define\nthe name of the new column directly as part of the expression, setting it equal\nto a computation in which we can access any columns of `x` by name, just as we\nwould in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. 
For example:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-7_86937bdc4f9b436be5721bf89cb48542'}\n\n```{.r .cell-code}\nx <- x %>%\n group_by(geo_value) %>%\n epi_slide(cases_7dav = mean(cases), before = 6) %>%\n ungroup()\n```\n:::\n\nIn addition to referring to individual columns by name, you can refer to the\ntime window data as an `epi_df` or `tibble` using `.x`. Similarly, the other arguments of the function format are available through the magic names `.group_key` and `.ref_time_value`, and the tidyverse \"pronouns\" `.data` and `.env` can also be used.\n\nAs a simple sanity check, we visualize the 7-day trailing averages computed on\ntop of the original counts.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-8_4be7d7ffd8b84de93dbeff6c68bf1113'}\n\n```{.r .cell-code code-fold=\"true\"}\ncols <- RColorBrewer::brewer.pal(7, \"Set1\")[-6]\nggplot(x, aes(x = time_value)) +\n geom_col(aes(y = cases, fill = geo_value),\n alpha = 0.5,\n show.legend = FALSE\n ) +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) +\n scale_fill_manual(values = cols) +\n scale_color_manual(values = cols) +\n facet_wrap(~geo_value, scales = \"free_y\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Reported COVID-19 cases\")\n```\n\n::: {.cell-output-display}\n![](slide_files/figure-html/unnamed-chunk-8-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAs we can see from the center top panel, it looks like Florida moved to weekly \nreporting of COVID-19 cases in summer of 2021, while California occasionally reported negative cases counts!\n\n## Running a local forecaster {#sec-local-forecaster}\n\nAs a more complex example, we preview some of the functionality of `{epipredict}` described in future chapters, and use a forecaster based on a\nlocal (in time)\nautoregression or \"AR model\". 
AR models can be fit in numerous ways \n(using base R\nfunctions and various packages), but here we the `arx_forecaster()`, implemented in `{epipredict}` both\nprovides a more advanced example of sliding a function over an `epi_df` object,\nand it allows us to be a bit more flexible in defining a *probabilistic*\nforecaster: one that outputs not just a point prediction, but a notion of\nuncertainty around this. In particular, our forecaster will output a point\nprediction along with an 90\\% uncertainty band, represented by a predictive\nquantiles at the 5\\% and 95\\% levels (lower and upper endpoints of the\nuncertainty band).\n\nThe function signature below, is a probabilistic AR forecaster. The\n`lags` argument indicates which lags to use in the model, and `ahead` indicates\nhow far ahead in the future to make forecasts (both are encoded in terms of the\nunits of the `time_value` column; so, days, in the working `epi_df` being\nconsidered in this vignette).\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-9_079e5420d9e5d2f5501eb74de8b45cb6'}\n\n```{.r .cell-code}\narx_forecaster <- function(\n epi_df, \n outcome, # the outcome column name in `epi_df`\n predictors, # a character vector, containing 1 or more predictors in `epi_df`\n trainer = quantile_reg(), \n args_list = arx_args_list(\n lags = c(0, 7, 14), \n ahead = 7,\n quantile_levels = c(0.05, 0.95)\n )\n)\n```\n:::\n\n\nWe go ahead and slide this AR forecaster over the working `epi_df` of COVID-19 \ncases. Note that we actually model the `cases_7dav` column, to operate on the \nscale of smoothed COVID-19 cases. 
This is clearly equivalent, up to a constant,\nto modeling weekly sums of COVID-19 cases.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-10_9e2bba94dc13dd185ad30b365f0a4eb4'}\n\n```{.r .cell-code}\nfc_time_values <- seq(\n from = as.Date(\"2020-06-01\"),\n to = as.Date(\"2021-12-01\"),\n by = \"1 months\"\n)\n\nfcasts <- epi_slide(\n x,\n ~ arx_forecaster(\n epi_data = .x,\n outcome = \"cases_7dav\",\n predictors = \"cases_7dav\",\n trainer = quantile_reg(),\n args_list = arx_args_list(ahead = 7)\n )$predictions,\n before = 119,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n)\n\n# grab just the relevant columns, and make them easier to plot\nfcasts <- fcasts %>%\n select(\n geo_value, time_value, cases_7dav,\n contains(\"_distn\"), fc_target_date\n ) %>%\n pivot_quantiles_wider(contains(\"_distn\"))\nfcasts\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 114 × 7\n#> geo_value time_value cases_7dav fc_target_date `0.05` `0.5` `0.95`\n#> \n#> 1 ca 2020-06-01 2655. 2020-06-08 1940. 2694. 3840.\n#> 2 fl 2020-06-01 726. 2020-06-08 558. 747. 1290.\n#> 3 ga 2020-06-01 643. 2020-06-08 520. 638. 1083.\n#> 4 ny 2020-06-01 1278. 2020-06-08 821. 1044. 1864.\n#> 5 pa 2020-06-01 603. 2020-06-08 450. 570. 1080.\n#> 6 tx 2020-06-01 1002. 2020-06-08 716. 1134. 1950.\n#> # ℹ 108 more rows\n```\n:::\n:::\n\n\nNote that here we have used an argument `ref_time_values` to perform the\nsliding computation (here, compute a forecast) at a specific subset of reference\ntime values. 
We get out 4 new columns: `fc_target_date`, `0.05`, `0.5`, `0.95`\nthat correspond to the date the forecast is for (rather than the date it was made on, the point forecast, and the lower and upper endpoints of the\n95\\% prediction band.[^1]\n\n[^1]: If instead we had set `as_list_col = TRUE`\nin the call to `epi_slide()`, then we would have gotten a list column `fc`, \nwhere each element of `fc` contains these results.\n\nTo finish off, we plot the forecasts at some times (spaced out by a few months)\nover the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do \nso, we encapsulate the process of generating forecasts into a simple function, \nso that we can call it a few times.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-11_d30fdf1ff99b2e470d215f81656d5b01'}\n\n```{.r .cell-code}\nk_week_ahead <- function(ahead = 7) {\n epi_slide(\n x,\n ~ arx_forecaster(\n epi_data = .x,\n outcome = \"cases_7dav\",\n predictors = \"cases_7dav\",\n trainer = quantile_reg(),\n args_list = arx_args_list(ahead = ahead)\n )$predictions,\n before = 119,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n select(\n geo_value, time_value, cases_7dav, contains(\"_distn\"),\n fc_target_date\n ) %>%\n pivot_quantiles_wider(contains(\"_distn\"))\n}\n\n# First generate the forecasts, and bind them together\nz <- map(c(7, 14, 21, 28), k_week_ahead) %>% list_rbind()\n```\n:::\n\n\nThen we can plot the on top of the observed data\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-12_f17b1e21df0fa2849ed240533f7e168f'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(z) +\n geom_line(data = x, aes(x = time_value, y = cases_7dav), color = \"gray50\") +\n geom_ribbon(aes(\n x = fc_target_date, ymin = `0.05`, ymax = `0.95`,\n group = time_value, fill = geo_value\n ), alpha = 0.4) +\n geom_line(aes(x = fc_target_date, y = `0.5`, group = time_value)) +\n geom_point(aes(x = fc_target_date, y = `0.5`, group = 
time_value), size = 0.5) +\n # geom_vline(data = tibble(x = fc_time_values), aes(xintercept = x),\n # linetype = 2, alpha = 0.5) +\n facet_wrap(vars(geo_value), scales = \"free_y\", nrow = 3) +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n scale_x_date(minor_breaks = \"1 months\", date_labels = \"%b %Y\") +\n scale_fill_viridis_d(guide = \"none\", end = .9) +\n labs(x = \"Date\", y = \"Reported COVID-19 cases\")\n```\n\n::: {.cell-output-display}\n![](slide_files/figure-html/unnamed-chunk-12-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nTwo points are worth making. First, the AR model's performance here is pretty\nspotty. At various points in time, we can see that its forecasts are volatile\n(its point predictions are all over the place), or overconfident (its bands are\ntoo narrow), or both at the same time. This is only meant as a simple demo and\nnot entirely unexpected given the way the AR model is set up. The\n[`epipredict`](https://cmu-delphi.github.io/epipredict) package, \noffers a suite of predictive modeling tools \nthat improve on many of the shortcomings of the above simple AR model (simply \nusing all states for training rather than 6 is a huge improvement).\n\nSecond, the AR forecaster here is using finalized data, meaning, it uses the\nlatest versions of signal values (reported COVID-19 cases) available, for both\ntraining models and making predictions historically. However, this is not\nreflective of the provisional nature of the data that it must cope with in a\ntrue forecast task. Training and making predictions on finalized data can lead\nto an overly optimistic sense of accuracy; see, for example, \n[@McDonaldBien2021] and references\ntherein. 
Fortunately, the `epiprocess` package provides a data structure called\n`epi_archive` that can be used to store all data revisions, and furthermore, an\n`epi_archive` object knows how to slide computations in the correct\nversion-aware sense (for the computation at each reference time $t$, it uses\nonly data that would have been available as of $t$). We will revisit this \nexample in the [archive \nvignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html).\n", + "markdown": "# Sliding computations {#sec-sliding}\n\nA central tool in the `{epiprocess}` package is `epi_slide()`, which is based on\nthe powerful functionality provided in the\n[`slider`](https://cran.r-project.org/web/packages/slider) package. In\n`epiprocess`, to \"slide\" means to apply a computation---represented as a\nfunction or formula---over a sliding/rolling data window. The function always\napplies the slide inside each group and the grouping is assumed to be across all\ngroup keys of the `epi_df` (this is the grouping used by default if you do not\ngroup the `epi_df` with a `group_by()`).\n\nBy default, the `.window_size` units depend on the `time_type` of the `epi_df`,\nwhich is determined from the types in the `time_value` column of the `epi_df`.\nSee the \"Details\" in `epi_slide()` for more.\n\nAs in getting started guide, we'll fetch daily reported COVID-19 cases from CA,\nFL, NY, and TX (note: here we're using new, not cumulative cases) using the\n[`epidatr`](https://github.com/cmu-delphi/epidatr) package, and then convert\nthis to `epi_df` format.\n\n\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-2_feb3ab09af2a656b7552aabd4fb92768'}\n\n```{.r .cell-code}\nlibrary(epidatr)\nlibrary(epiprocess)\nlibrary(epipredict)\n```\n:::\n\n\nThe example data we'll use is part of the package and has 2,684 rows and 3 columns.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-3_b89fbf7446d0088fa754a437bf6f03e8'}\n\n```{.r 
.cell-code}\ndata(jhu_csse_daily_subset)\nedf <- jhu_csse_daily_subset %>%\n select(geo_value, time_value, cases) %>%\n arrange(geo_value, time_value) %>%\n as_epi_df()\n```\n:::\n\n\n## Optimized rolling mean and sums\n\nFor the two most common sliding operations, we offer two optimized versions:\n`epi_slide_mean()` and `epi_slide_sum()`. This example gets the 7-day trailing\naverage of the daily cases. Note that the name of the column(s) that we want to\naverage is specified as the first argument of `epi_slide_mean()`.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-4_a2804c3f9814a21c5786987a3f2838e3'}\n\n```{.r .cell-code}\nedf %>%\n group_by(geo_value) %>%\n epi_slide_mean(\"cases\", .window_size = 7, na.rm = TRUE) %>%\n ungroup() %>%\n head(10)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 10 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2024-08-22 19:40:48.296938\n#> \n#> # A tibble: 10 × 4\n#> geo_value time_value cases slide_value_cases\n#> * \n#> 1 ca 2020-03-01 6 6 \n#> 2 ca 2020-03-02 4 5 \n#> 3 ca 2020-03-03 6 5.33\n#> 4 ca 2020-03-04 11 6.75\n#> 5 ca 2020-03-05 10 7.4 \n#> 6 ca 2020-03-06 18 9.17\n#> # ℹ 4 more rows\n```\n:::\n:::\n\n\nNote that we passed `na.rm = TRUE` to `data.table::frollmean()` via `...` to\n`epi_slide_mean`.\n\nThe following computes the 7-day trailing sum of daily cases (and passed `na.rm`\nto `data.table::frollsum()` similarly):\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-5_5f3bbe76c94ac276d887fce7ad154af5'}\n\n```{.r .cell-code}\nedf %>%\n group_by(geo_value) %>%\n epi_slide_sum(\"cases\", .window_size = 7, na.rm = TRUE) %>%\n ungroup() %>%\n head(10)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 10 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2024-08-22 19:40:48.296938\n#> \n#> # A tibble: 10 × 4\n#> geo_value time_value cases 
slide_value_cases\n#> * \n#> 1 ca 2020-03-01 6 6\n#> 2 ca 2020-03-02 4 10\n#> 3 ca 2020-03-03 6 16\n#> 4 ca 2020-03-04 11 27\n#> 5 ca 2020-03-05 10 37\n#> 6 ca 2020-03-06 18 55\n#> # ℹ 4 more rows\n```\n:::\n:::\n\n\n## General sliding with a formula\n\nThe previous computations can also be performed using `epi_slide()`, which can\nbe used for more general sliding computations (but is much slower for the\nspecific cases of mean and sum).\n\nThe same 7-day trailing average of daily cases can be computed by passing in a\nformula for the first argument of `epi_slide()`:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-6_a5a7685ae8752595826c00ed5173f752'}\n\n```{.r .cell-code}\nedf %>%\n group_by(geo_value) %>%\n epi_slide(~ mean(.x$cases, na.rm = TRUE), .window_size = 7) %>%\n ungroup() %>%\n head(10)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 10 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2024-08-22 19:40:48.296938\n#> \n#> # A tibble: 10 × 4\n#> geo_value time_value cases slide_value\n#> * \n#> 1 ca 2020-03-01 6 6 \n#> 2 ca 2020-03-02 4 5 \n#> 3 ca 2020-03-03 6 5.33\n#> 4 ca 2020-03-04 11 6.75\n#> 5 ca 2020-03-05 10 7.4 \n#> 6 ca 2020-03-06 18 9.17\n#> # ℹ 4 more rows\n```\n:::\n:::\n\n\nIf your formula returns a data.frame, then the columns of the data.frame\nwill be unpacked into the resulting `epi_df`. 
For example, the following\ncomputes the 7-day trailing average of daily cases and the 7-day trailing sum of\ndaily cases:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-7_3f4404872d142bf1fb425e712231a143'}\n\n```{.r .cell-code}\nedf %>%\n group_by(geo_value) %>%\n epi_slide(\n ~ data.frame(cases_mean = mean(.x$cases, na.rm = TRUE), cases_sum = sum(.x$cases, na.rm = TRUE)),\n .window_size = 7\n ) %>%\n ungroup() %>%\n head(10)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 10 x 5 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2024-08-22 19:40:48.296938\n#> \n#> # A tibble: 10 × 5\n#> geo_value time_value cases cases_mean cases_sum\n#> * \n#> 1 ca 2020-03-01 6 6 6\n#> 2 ca 2020-03-02 4 5 10\n#> 3 ca 2020-03-03 6 5.33 16\n#> 4 ca 2020-03-04 11 6.75 27\n#> 5 ca 2020-03-05 10 7.4 37\n#> 6 ca 2020-03-06 18 9.17 55\n#> # ℹ 4 more rows\n```\n:::\n:::\n\n\nNote that this formula has access to all non-grouping columns present in the\noriginal `epi_df` object and must refer to them with the prefix `.x$...`. 
As we\ncan see, the function `epi_slide()` returns an `epi_df` object with a new column\nappended that contains the results (from sliding), named `slide_value` as the\ndefault.\n\nSome other information is available in additional variables:\n\n* `.group_key` is a one-row tibble containing the values of the grouping\n variables for the associated group\n* `.ref_time_value` is the reference time value the time window was based on\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-8_6fef540b7403e96758a8385c45ac0b68'}\n\n```{.r .cell-code}\n# Returning geo_value in the formula\nedf %>%\n group_by(geo_value) %>%\n epi_slide(~ .x$geo_value[[1]], .window_size = 7) %>%\n ungroup() %>%\n head(10)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 10 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2024-08-22 19:40:48.296938\n#> \n#> # A tibble: 10 × 4\n#> geo_value time_value cases slide_value\n#> * \n#> 1 ca 2020-03-01 6 ca \n#> 2 ca 2020-03-02 4 ca \n#> 3 ca 2020-03-03 6 ca \n#> 4 ca 2020-03-04 11 ca \n#> 5 ca 2020-03-05 10 ca \n#> 6 ca 2020-03-06 18 ca \n#> # ℹ 4 more rows\n```\n:::\n\n```{.r .cell-code}\n# Returning time_value in the formula\nedf %>%\n group_by(geo_value) %>%\n epi_slide(~ .x$time_value[[1]], .window_size = 7) %>%\n ungroup() %>%\n head(10)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 10 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2024-08-22 19:40:48.296938\n#> \n#> # A tibble: 10 × 4\n#> geo_value time_value cases slide_value\n#> * \n#> 1 ca 2020-03-01 6 2020-02-24 \n#> 2 ca 2020-03-02 4 2020-02-25 \n#> 3 ca 2020-03-03 6 2020-02-26 \n#> 4 ca 2020-03-04 11 2020-02-27 \n#> 5 ca 2020-03-05 10 2020-02-28 \n#> 6 ca 2020-03-06 18 2020-02-29 \n#> # ℹ 4 more rows\n```\n:::\n:::\n\n\nWhile the computations above do not look very useful, these can be used as\nbuilding blocks for computations that do something different 
depending on the\ngeo_value or ref_time_value.\n\n## Slide the tidy way\n\nPerhaps the most convenient way to set up a computation in `epi_slide()` is to\npass in an expression for tidy evaluation. In this case, we can simply define\nthe name of the new column directly as part of the expression, setting it equal\nto a computation in which we can access any columns of `.x` by name, just as we\nwould in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. For example:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-9_099c56974dc1b1b9333fc886ce5e0236'}\n\n```{.r .cell-code}\nslide_output <- edf %>%\n group_by(geo_value) %>%\n epi_slide(cases_7dav = mean(cases, na.rm = TRUE), .window_size = 7) %>%\n ungroup() %>%\n head(10)\n```\n:::\n\n\nIn addition to referring to individual columns by name, you can refer to the\n`epi_df` time window as `.x` (`.group_key` and `.ref_time_value` are still\navailable). The tidyverse \"pronouns\" `.data` and `.env` can also be used\nif you need to distinguish between the data and environment.\n\nAs a simple sanity check, we visualize the 7-day trailing averages computed on\ntop of the original counts:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-10_4334cefaa041ba7576fa06309b28992d'}\n\n```{.r .cell-code}\nlibrary(ggplot2)\ntheme_set(theme_bw())\n\nggplot(slide_output, aes(x = time_value)) +\n geom_col(aes(y = cases, fill = geo_value), alpha = 0.5, show.legend = FALSE) +\n geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) +\n facet_wrap(~geo_value, scales = \"free_y\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n labs(x = \"Date\", y = \"Reported COVID-19 cases\")\n```\n\n::: {.cell-output-display}\n![](slide_files/figure-html/unnamed-chunk-10-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAs we can see from the top right panel, it looks like Texas moved to weekly\nreporting of COVID-19 cases in summer of 2021.\n\n## Slide 
with a function\n\nWe can also pass a function to the second argument in `epi_slide()`. In this\ncase, the passed function `.f` must have the form `function(x, g, t, ...)`,\nwhere\n\n- \"x\" is an epi_df with the same column names as the archive's `DT`, minus\n the `version` column\n- \"g\" is a one-row tibble containing the values of the grouping variables\nfor the associated group\n- \"t\" is the ref_time_value for the current window\n- \"...\" are additional arguments\n\nRecreating the last example of a 7-day trailing average:\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-11_12a17a34c13a7e8307744518a41d53a4'}\n\n```{.r .cell-code}\nx <- edf %>%\n group_by(geo_value) %>%\n epi_slide(function(x, g, t) mean(x$cases, na.rm = TRUE), .window_size = 7, .new_col_name = \"cases_7dav\") %>%\n ungroup()\nx %>%\n head(10)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> An `epi_df` object, 10 x 4 with metadata:\n#> * geo_type = state\n#> * time_type = day\n#> * as_of = 2024-08-22 19:40:48.296938\n#> \n#> # A tibble: 10 × 4\n#> geo_value time_value cases cases_7dav\n#> * \n#> 1 ca 2020-03-01 6 6 \n#> 2 ca 2020-03-02 4 5 \n#> 3 ca 2020-03-03 6 5.33\n#> 4 ca 2020-03-04 11 6.75\n#> 5 ca 2020-03-05 10 7.4 \n#> 6 ca 2020-03-06 18 9.17\n#> # ℹ 4 more rows\n```\n:::\n:::\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-12_34e93e3b2dc10094fb15530dac8e7f0f'}\n\n```{.r .cell-code code-fold=\"true\"}\ncols <- RColorBrewer::brewer.pal(7, \"Set1\")[-6]\nggplot(x, aes(x = time_value)) +\n geom_col(aes(y = cases, fill = geo_value),\n alpha = 0.5,\n show.legend = FALSE\n ) +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) +\n scale_fill_manual(values = cols) +\n scale_color_manual(values = cols) +\n facet_wrap(~geo_value, scales = \"free_y\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %Y\") +\n labs(x = \"Date\", y = \"Reported 
COVID-19 cases\")\n```\n\n::: {.cell-output-display}\n![](slide_files/figure-html/unnamed-chunk-12-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAs we can see from the center top panel, it looks like Florida moved to weekly\nreporting of COVID-19 cases in summer of 2021, while California occasionally\nreported negative case counts!\n\n## Running a local forecaster {#sec-local-forecaster}\n\nAs a more complex example, we preview some of the functionality of\n`{epipredict}` described in future chapters, and use a forecaster based on a\nlocal (in time) autoregression or \"AR model\". AR models can be fit in numerous\nways (using base R functions and various packages), but here we use the\n`arx_forecaster()` implemented in `{epipredict}`, which both provides a more advanced\nexample of sliding a function over an `epi_df` object and allows us to be a\nbit more flexible in defining a *probabilistic* forecaster: one that outputs not\njust a point prediction, but a notion of uncertainty around this. In particular,\nour forecaster will output a point prediction along with a 90\\% uncertainty\nband, represented by predictive quantiles at the 5\\% and 95\\% levels (lower\nand upper endpoints of the uncertainty band).\n\nThe function signature below is that of a probabilistic AR forecaster. 
The\n`lags` argument indicates which lags to use in the model, and `ahead` indicates\nhow far ahead in the future to make forecasts (both are encoded in terms of the\nunits of the `time_value` column; so, days, in the working `epi_df` being\nconsidered in this vignette).\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-13_190292c89295d7aec9312bafe5b82171'}\n\n```{.r .cell-code}\narx_forecaster <- function(\n epi_df,\n outcome, # the outcome column name in `epi_df`\n predictors, # a character vector, containing 1 or more predictors in `epi_df`\n trainer = quantile_reg(),\n args_list = arx_args_list(\n lags = c(0, 7, 14),\n ahead = 7,\n quantile_levels = c(0.05, 0.95)\n )) {\n ...\n}\n```\n:::\n\n\nWe go ahead and slide this AR forecaster over the working `epi_df` of COVID-19\ncases. Note that we actually model the `cases_7dav` column, to operate on the\nscale of smoothed COVID-19 cases. This is clearly equivalent, up to a constant,\nto modeling weekly sums of COVID-19 cases.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-14_122fc4479578275735303f9c21ce9c5e'}\n\n```{.r .cell-code}\nfc_time_values <- seq(\n from = as.Date(\"2020-06-01\"),\n to = as.Date(\"2021-12-01\"),\n by = \"1 months\"\n)\n\nfcasts <- epi_slide(\n x,\n .f = ~ arx_forecaster(\n epi_data = .x,\n outcome = \"cases_7dav\",\n predictors = \"cases_7dav\",\n trainer = quantile_reg(),\n args_list = arx_args_list(ahead = 7)\n )$predictions,\n .window_size = 120,\n .ref_time_values = fc_time_values\n)\n\n# grab just the relevant columns, and make them easier to plot\nfcasts <- fcasts %>%\n select(geo_value, time_value, cases_7dav, .pred, .pred_distn) %>%\n pivot_quantiles_wider(\".pred_distn\")\nfcasts\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n#> # A tibble: 114 × 7\n#> # Groups: geo_value [6]\n#> geo_value time_value cases_7dav .pred `0.05` `0.5` `0.95`\n#> \n#> 1 ca 2020-06-01 2694 2332. 2266. 2332. 2957.\n#> 2 ca 2020-07-01 6722 7979. 
7081. 7979. 8999.\n#> 3 ca 2020-08-01 8284. 7339. 6745. 7339. 7630.\n#> 4 ca 2020-09-01 4707. 3291. 3264. 3291. 7571.\n#> 5 ca 2020-10-01 3360. 4270. 3213. 4270. 5714.\n#> 6 ca 2020-11-01 4441. 4172. 4028. 4172. 5491.\n#> # ℹ 108 more rows\n```\n:::\n:::\n\n\nNote that we have used the argument `.ref_time_values` to compute the forecast\nat a specific subset of reference time values. We get out 4 new columns:\n`.pred`, `0.05`, `0.5`, `0.95` that correspond to the point forecast, and the\nlower endpoint, median, and upper endpoint of the 90\\% prediction band; note\nthat `time_value` is the date the forecast was made on (rather than the date it\nis for).[^1]\n\n[^1]: If instead we had set `as_list_col = TRUE` in the call to `epi_slide()`,\nthen we would have gotten a list column `fc`, where each element of `fc`\ncontains these results.\n\nTo finish off, we plot the forecasts at some times (spaced out by a few months)\nover the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do\nso, we encapsulate the process of generating forecasts into a simple function,\nso that we can call it a few times.\n\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-15_70aa61239ab94d7efb5385e12b4cccb7'}\n\n```{.r .cell-code}\nk_week_ahead <- function(ahead = 7) {\n epi_slide(\n x,\n ~ arx_forecaster(\n epi_data = .x,\n outcome = \"cases_7dav\",\n predictors = \"cases_7dav\",\n trainer = quantile_reg(),\n args_list = arx_args_list(ahead = ahead)\n )$predictions,\n .window_size = 120,\n .ref_time_values = fc_time_values\n ) %>%\n select(geo_value, time_value, cases_7dav, .pred, .pred_distn) %>%\n pivot_quantiles_wider(\".pred_distn\")\n}\n\n# First generate the forecasts, and bind them together\nz <- map(c(7, 14, 21, 28), k_week_ahead) %>% list_rbind()\n```\n:::\n\n::: {.cell layout-align=\"center\" hash='slide_cache/html/unnamed-chunk-16_b4bf5f3aa24918e1188f8225b7963187'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(z) +\n geom_line(data = x, aes(x = time_value, y = cases_7dav), 
color = \"gray50\") +\n geom_ribbon(aes(\n x = time_value, ymin = `0.05`, ymax = `0.95`,\n group = time_value, fill = geo_value\n ), alpha = 0.4) +\n geom_line(aes(x = time_value, y = `0.5`, group = time_value)) +\n geom_point(aes(x = time_value, y = `0.5`, group = time_value), size = 0.5) +\n # geom_vline(data = tibble(x = fc_time_values), aes(xintercept = x),\n # linetype = 2, alpha = 0.5) +\n facet_wrap(vars(geo_value), scales = \"free_y\", nrow = 3) +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n scale_x_date(minor_breaks = \"1 months\", date_labels = \"%b %Y\") +\n scale_fill_viridis_d(guide = \"none\", end = .9) +\n labs(x = \"Date\", y = \"Reported COVID-19 cases\")\n```\n\n::: {.cell-output-display}\n![](slide_files/figure-html/unnamed-chunk-16-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nTwo points are worth making. First, the AR model's performance here is pretty\nspotty. At various points in time, we can see that its forecasts are volatile\n(its point predictions are all over the place), or overconfident (its bands are\ntoo narrow), or both at the same time. This is only meant as a simple demo and\nnot entirely unexpected given the way the AR model is set up. The\n[`epipredict`](https://cmu-delphi.github.io/epipredict) package,\noffers a suite of predictive modeling tools\nthat improve on many of the shortcomings of the above simple AR model (simply\nusing all states for training rather than 6 is a huge improvement).\n\nSecond, the AR forecaster here is using finalized data, meaning, it uses the\nlatest versions of signal values (reported COVID-19 cases) available, for both\ntraining models and making predictions historically. However, this is not\nreflective of the provisional nature of the data that it must cope with in a\ntrue forecast task. Training and making predictions on finalized data can lead\nto an overly optimistic sense of accuracy; see, for example,\n[@McDonaldBien2021] and references\ntherein. 
Fortunately, the `epiprocess` package provides a data structure called\n`epi_archive` that can be used to store all data revisions, and furthermore, an\n`epi_archive` object knows how to slide computations in the correct\nversion-aware sense (for the computation at each reference time $t$, it uses\nonly data that would have been available as of $t$). We will revisit this\nexample in the [archive\nvignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html).\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/slide/figure-html/unnamed-chunk-10-1.svg b/_freeze/slide/figure-html/unnamed-chunk-10-1.svg new file mode 100644 index 0000000..5345f6d --- /dev/null +++ b/_freeze/slide/figure-html/unnamed-chunk-10-1.svg @@ -0,0 +1,308 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_freeze/slide/figure-html/unnamed-chunk-12-1.svg b/_freeze/slide/figure-html/unnamed-chunk-12-1.svg index 1c5a420..3df4513 100644 --- a/_freeze/slide/figure-html/unnamed-chunk-12-1.svg +++ b/_freeze/slide/figure-html/unnamed-chunk-12-1.svgdiff --git a/_freeze/slide/figure-html/unnamed-chunk-16-1.svg b/_freeze/slide/figure-html/unnamed-chunk-16-1.svg new file mode 100644 index 0000000..59929b8 --- /dev/null +++ b/_freeze/slide/figure-html/unnamed-chunk-16-1.svgdiff --git a/_freeze/slide/figure-html/unnamed-chunk-8-1.svg b/_freeze/slide/figure-html/unnamed-chunk-8-1.svg index 
ad8c8f0..9cc05d8 100644 --- a/_freeze/slide/figure-html/unnamed-chunk-8-1.svg +++ b/_freeze/slide/figure-html/unnamed-chunk-8-1.svgdiff --git a/_freeze/sliding-forecasters/execute-results/html.json b/_freeze/sliding-forecasters/execute-results/html.json index 474dbe2..74fa0c2 100644 --- a/_freeze/sliding-forecasters/execute-results/html.json +++ b/_freeze/sliding-forecasters/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "bcaf5b1ddd41612a320d992385364b2b", + "hash": "a74b5381d41b5132ec6860cad9e9e252", "result": { - "markdown": "# Pseudo-prospective forecast inspection\n\n\n::: {.cell}\n\n:::\n\n\n\nA key function from the epiprocess package is `epi_slide()`, which allows the\nuser to apply a function or formula-based computation over variables in an\n`epi_df` over a running window of `n` time steps (see the following `epiprocess`\nvignette to go over the basics of the function: [\"Slide a computation over\nsignal values\"](https://cmu-delphi.github.io/epiprocess/articles/slide.html)).\nThe equivalent sliding method for an `epi_archive` object can be called by using\nthe wrapper function `epix_slide()` (refer to the following vignette for the\nbasics of the function: [\"Work with archive objects and data\nrevisions\"](https://cmu-delphi.github.io/epiprocess/articles/archive.html)). The\nkey difference from `epi_slide()` is that it performs version-aware\ncomputations. That is, the function only uses data that would have been\navailable as of time t for that reference time.\n\nIn this vignette, we use `epi_slide()` and `epix_slide()` for backtesting our\n`arx_forecaster` on historical COVID-19 case data from the US and from Canada.\nMore precisely, we first demonstrate using `epi_slide()` to slide ARX\nforecasters over an `epi_df` object and compare the results obtained from using\ndifferent forecasting engines. 
We then compare these simple retrospective\nforecasts to more proper \"pseudoprospective\" forecasts generated using snapshots\nof the data that was available in real time, using `epix_slide()`.\n\n## Comparing different forecasting engines\n\n### Example using CLI and case data from US states\n\nFirst, we download the version history (i.e. archive) of the percentage of\ndoctor’s visits with CLI (COVID-like illness) computed from medical insurance\nclaims and the number of new confirmed COVID-19 cases per 100,000 population\n(daily) for all 50 states from the COVIDcast API. We process as before, with the\nmodification that we use `sync = \"locf\"` in `epix_merge()` so that the last\nversion of each observation can be carried forward to extrapolate unavailable\nversions for the less up-to-date input archive.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/grab-epi-data_89a9d4079f8ffc6080f83369668b2316'}\n\n```{.r .cell-code}\nus_raw_history_dfs <- readRDS(url(\n \"https://github.com/cmu-delphi/epipredict/raw/dev/vignettes/articles/all_states_covidcast_signals.rds\"\n))\n\nus_cli_archive <- us_raw_history_dfs[[1]] %>%\n select(geo_value, time_value, version = issue, percent_cli = value) %>%\n as_epi_archive(compactify = TRUE)\nus_cases_archive <- us_raw_history_dfs[[2]] %>%\n select(geo_value, time_value, version = issue, case_rate = value) %>%\n as_epi_archive(compactify = TRUE)\n\nus_archive <- epix_merge(\n us_cli_archive, us_cases_archive,\n sync = \"locf\", compactify = TRUE\n)\n```\n:::\n\n\nAfter obtaining the latest snapshot of the data, we produce forecasts on that\ndata using the default engine of simple linear regression and compare to a\nrandom forest.\n\nNote that all of the warnings about the forecast date being less than the most\nrecent update date of the data have been suppressed to avoid cluttering the\noutput.\n\n\n::: {.cell layout-align=\"center\" 
hash='sliding-forecasters_cache/html/make-arx-kweek_404b2b076aee47ac4ed54f2b9ba369d2'}\n\n```{.r .cell-code}\n# Latest snapshot of data, and forecast dates\nus_latest <- epix_as_of(us_archive, max_version = max(us_archive$versions_end))\nfc_time_values <- seq(\n from = as.Date(\"2020-08-01\"),\n to = as.Date(\"2021-11-01\"),\n by = \"1 month\"\n)\naheads <- c(7, 14, 21, 28)\n\nk_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) {\n epi_slide(epi_df, ~ arx_forecaster(\n .x, outcome, predictors, engine,\n args_list = arx_args_list(ahead = ahead)\n )$predictions %>%\n select(-geo_value),\n before = 120L - 1L,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n select(geo_value, time_value, starts_with(\"fc\")) %>%\n mutate(engine_type = engine$engine)\n}\n\n# Generate the forecasts and bind them together\nfc <- bind_rows(\n map(aheads, ~ k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"), .x,\n engine = linear_reg()\n )) %>%\n list_rbind(),\n map(aheads, ~ k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"), .x,\n engine = rand_forest(mode = \"regression\")\n )) %>%\n list_rbind()\n) %>%\n pivot_quantiles_wider(contains(\"_distn\"))\n```\n:::\n\n\nHere, `arx_forecaster()` does all the heavy lifting. It creates leads of the\ntarget (respecting time stamps and locations) along with lags of the features\n(here, the response and doctors visits), estimates a forecasting model using the\nspecified engine, creates predictions, and non-parametric confidence bands.\n\nTo see how the predictions compare, we plot them on top of the latest case\nrates. 
Note that even though we've fitted the model on all states,\nwe'll just display the\nresults for two states, California (CA) and Florida (FL), to get a sense of the\nmodel performance while keeping the graphic simple.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-arx_c99e5bb41b438e998b3904771b96568d'}\n\n```{.r .cell-code code-fold=\"true\"}\nfc_cafl <- fc %>% filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_cafl <- us_latest %>% filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(fc_cafl, aes(fc_target_date, group = time_value, fill = engine_type)) +\n geom_line(\n data = latest_cafl, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_grid(engine_type ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_fill_brewer(palette = \"Set1\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-arx-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nFor the two states of interest, simple linear regression clearly performs better\nthan random forest in terms of accuracy of the predictions and does not\nresult in such in overconfident predictions (overly narrow confidence bands).\nThough, in general, neither approach produces amazingly accurate forecasts.\nThis could be because\nthe behaviour is rather different across states and the effects of other notable\nfactors such as age and public health measures may be important to account for\nin such forecasting. 
Including such factors as well as making enhancements such\nas correcting for outliers are some improvements one could make to this simple\nmodel.[^1]\n\n[^1]: Note that, despite the above caveats, simple models like this tend to out-perform many far more complicated models in the online Covid forecasting due to those models high variance predictions.\n\n### Example using case data from Canada\n\nBy leveraging the flexibility of `epiprocess`, we can apply the same techniques\nto data from other sources. Since some collaborators are in British Columbia,\nCanada, we'll do essentially the same thing for Canada as we did above.\n\nThe [COVID-19 Canada Open Data Working Group](https://opencovid.ca/) collects\ndaily time series data on COVID-19 cases, deaths, recoveries, testing and\nvaccinations at the health region and province levels. Data are collected from\npublicly available sources such as government datasets and news releases.\nUnfortunately, there is no simple versioned source, so we have created our own\nfrom the Github commit history.\n\nFirst, we load versioned case rates at the provincial level. After converting\nthese to 7-day averages (due to highly variable provincial reporting\nmismatches), we then convert the data to an `epi_archive` object, and extract\nthe latest version from it. 
Finally, we run the same forcasting exercise as for\nthe American data, but here we compare the forecasts produced from using simple\nlinear regression with those from using boosted regression trees.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/get-can-fc_731a38443595452707f8f146a49636dd'}\n\n```{.r .cell-code}\n# source(\"drafts/canada-case-rates.R)\ncan <- epidatasets::can_prov_cases\ncan <- can %>%\n group_by(version, geo_value) %>%\n arrange(time_value) %>%\n mutate(cr_7dav = RcppRoll::roll_meanr(case_rate, n = 7L)) %>%\n as_epi_archive(compactify = TRUE)\n\ncan_latest <- epix_as_of(can, max_version = max(can$DT$version))\n\n# Generate the forecasts, and bind them together\ncan_fc <- bind_rows(\n map(aheads, ~ k_week_ahead(\n can_latest, \"cr_7dav\", \"cr_7dav\", .x, linear_reg()\n )) %>%\n list_rbind(),\n map(aheads, ~ k_week_ahead(\n can_latest, \"cr_7dav\", \"cr_7dav\", .x,\n boost_tree(mode = \"regression\", trees = 20)\n )) %>%\n list_rbind()\n) %>%\n pivot_quantiles_wider(contains(\"_distn\"))\n```\n:::\n\n\nThe first figure shows the results for all of the provinces using linear regression.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-lr_a609b4a2e0dd0f49e145eb8a6b3ff50e'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"lm\"),\n aes(x = fc_target_date, group = time_value)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest, aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = 
expansion(c(0, 0.05))) +\n labs(\n title = \"Using simple linear regression\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-lr-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nCompare those forecasts with a related set using Gradient Boosting.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-boost_c606f4fff3ff4eb8919817c6e7100441'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"xgboost\"),\n aes(x = fc_target_date, group = time_value)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest, aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(\n title = \"Using boosted regression trees\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-boost-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nBoth approaches tend to produce quite volatile forecasts (point predictions)\nand/or are overly confident (very narrow bands), particularly when boosted\nregression trees are used. 
But as this is meant to be a simple demonstration of\nsliding with different engines in `arx_forecaster`, we may devote another\nvignette to work on improving the predictive modelling using the suite of tools\navailable in epipredict.\n\n## Pseudoprospective vs. unfaithful retrospective forecasting\n\n### Example using case data from US states\n\nWe will now run pseudoprospective forecasts based on properly-versioned data\n(that would have been available in real-time) to forecast future COVID-19 case\nrates from current and past COVID-19 case rates for all states. That is, we can\nmake forecasts on the archive, `us_archive`, and compare those to forecasts on\n(time windows of) the latest data, `us_latest`, using the same general set-up as\nabove. For pseudoprospective forecasting, note that `us_archive` is fed into\n`epix_slide()`, while for simpler (unfaithful) retrospective forecasting,\n`us_latest` is fed into `epi_slide()`. #%% update to include percent_cli after\nthat issue is fixed?\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-ar-kweek-asof_bb2b98563c892027582a2aa804662dcc'}\n\n```{.r .cell-code}\nk_week_versioning <- function(ahead, version = c(\"faithful\", \"unfaithful\")) {\n version <- match.arg(version)\n if (version == \"faithful\") {\n epix_slide(\n us_archive,\n ~ arx_forecaster(\n .x, \"case_rate\", c(\"case_rate\", \"percent_cli\"),\n args_list = arx_args_list(ahead = ahead)\n )$predictions,\n before = 120 - 1,\n ref_time_values = fc_time_values,\n new_col_name = \"fc\"\n ) %>%\n mutate(version = \"version faithful\") %>%\n rename(geo_value = \"fc_geo_value\")\n } else {\n k_week_ahead(\n us_latest, \"case_rate\", c(\"case_rate\", \"percent_cli\"),\n ahead, linear_reg()\n ) %>% mutate(version = \"not version faithful\")\n }\n}\n\n# Generate the forecasts, and bind them together\nfc <- bind_rows(\n map(aheads, ~ k_week_versioning(.x, \"faithful\")) %>% list_rbind(),\n map(aheads, ~ k_week_versioning(.x, 
\"unfaithful\")) %>% list_rbind()\n) %>% pivot_quantiles_wider(fc_.pred_distn)\n```\n:::\n\n\nNow we can plot the results on top of the latest case rates. As before, we will only display and focus on the results for FL and CA for simplicity.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-ar-asof_e484487727d34e88d3d51c13eeb6cdaa'}\n\n```{.r .cell-code code-fold=\"true\"}\nfc_cafl <- fc %>% filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_cafl <- us_latest %>% filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(fc_cafl, aes(x = fc_target_date, group = time_value)) +\n geom_line(\n data = latest_cafl, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = version), alpha = 0.4) +\n geom_line(aes(y = fc_.pred)) +\n geom_point(aes(y = fc_.pred), size = 0.5) +\n geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +\n facet_grid(version ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n scale_fill_brewer(palette = \"Set1\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-ar-asof-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAgain, we observe that the results are not great for these two states, but\nthat's likely due to the simplicity of the model (ex. the omission of key\nfactors such as age and public health measures) and the quality of the data (ex.\nwe have not personally corrected for anomalies in the data).\n\nWe shall leave it to the reader to try the above version aware and unaware\nforecasting exercise on the Canadian case rate data. 
The above code for the\nAmerican state data should be readily adaptable for this purpose.\n", + "markdown": "# Sliding version-unaware and version-aware ARX forecasters across dates\n\n\n::: {.cell}\n\n:::\n\n\nA key function from the epiprocess package is `epix_slide()` (refer to the\nfollowing vignette for the basics of the function: [\"Work with archive objects\nand data\nrevisions\"](https://cmu-delphi.github.io/epiprocess/articles/archive.html))\nwhich allows performing version-aware computations. That is, the function only\nuses data that would have been available as of time t for that reference time.\n\nIn this vignette, we use `epix_slide()` for backtesting our `arx_forecaster` on\nhistorical COVID-19 case data from the US and from Canada. We first examine the\nresults from a version-unaware forecaster, comparing two different fitting\nengines and then we contrast this with version-aware forecasting. The former\nwill proceed by constructing an `epi_archive` that erases its version\ninformation and then use `epix_slide()` to forecast the future. The latter will\nkeep the versioned data and proceed similarly by using `epix_slide()` to\nforecast the future.\n\n## Version-unaware forecasting\n\n### Example using CLI and case data from US states\n\nFirst, we download the version history (i.e. archive) of the percentage of\ndoctor’s visits with CLI (COVID-like illness) computed from medical insurance\nclaims and the number of new confirmed COVID-19 cases per 100,000 population\n(daily) for all 50 states from the COVIDcast API. 
We process as before, with the\nmodification that we use `sync = \"locf\"` in `epix_merge()` so that the last\nversion of each observation can be carried forward to extrapolate unavailable\nversions for the less up-to-date input archive.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/grab-epi-data_89a9d4079f8ffc6080f83369668b2316'}\n\n```{.r .cell-code}\nus_raw_history_dfs <- readRDS(url(\n \"https://github.com/cmu-delphi/epipredict/raw/dev/vignettes/articles/all_states_covidcast_signals.rds\"\n))\n\nus_cli_archive <- us_raw_history_dfs[[1]] %>%\n select(geo_value, time_value, version = issue, percent_cli = value) %>%\n as_epi_archive(compactify = TRUE)\nus_cases_archive <- us_raw_history_dfs[[2]] %>%\n select(geo_value, time_value, version = issue, case_rate = value) %>%\n as_epi_archive(compactify = TRUE)\n\nus_archive <- epix_merge(\n us_cli_archive, us_cases_archive,\n sync = \"locf\", compactify = TRUE\n)\n```\n:::\n\n\nWe then get the latest snapshot of the data from the archive by using\n`epix_as_of()`. We then create fake version information by setting `version =\ntime_value`. 
This creates an archive that pretends to have the latest data\navailable (since at version time `x` it has all the data up to time_value `x`,\nwhich in reality is unrealistic because the time values of the data received at\nversion time `x` often lag by a few days, not to mention the later corrections\nthat are applied to the data).\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-arx-kweek_6ff04c287f0d7a0f9d4503649e56bd3a'}\n\n```{.r .cell-code}\n# Latest snapshot of data, and forecast dates\nus_latest <- us_archive %>%\n epix_as_of(version = max(.$versions_end)) %>%\n mutate(version = time_value) %>%\n as_epi_archive()\nfc_time_values <- seq(\n from = as.Date(\"2020-08-01\"),\n to = as.Date(\"2021-11-01\"),\n by = \"1 month\"\n)\naheads <- c(7, 14, 21, 28)\n\nforecast_k_week_ahead <- function(epi_archive, outcome, predictors, ahead = 7, engine) {\n epi_archive %>%\n epix_slide(\n .f = function(x, gk, rtv) {\n arx_forecaster(\n x, outcome, predictors, engine,\n args_list = arx_args_list(ahead = ahead)\n )$predictions %>%\n mutate(engine_type = engine$engine) %>%\n pivot_quantiles_wider(.pred_distn)\n },\n .before = 120,\n .versions = fc_time_values\n )\n}\n\n# Generate the forecasts and bind them together\nforecasts_version_unaware <- bind_rows(\n map(aheads, ~ forecast_k_week_ahead(\n us_latest,\n outcome = \"case_rate\",\n predictors = c(\"case_rate\", \"percent_cli\"),\n ahead = .x,\n engine = linear_reg()\n )),\n map(aheads, ~ forecast_k_week_ahead(\n us_latest,\n outcome = \"case_rate\",\n predictors = c(\"case_rate\", \"percent_cli\"),\n ahead = .x,\n engine = rand_forest(mode = \"regression\")\n ))\n)\n```\n:::\n\n\nHere, `arx_forecaster()` does all the heavy lifting. 
It creates leads of the\ntarget (respecting time stamps and locations) along with lags of the features\n(here, the response and doctors visits), estimates a forecasting model using the\nspecified engine, creates predictions, and non-parametric confidence bands.\n\nTo see how the predictions compare, we plot them on top of the latest case\nrates. Note that even though we've fitted the model on all states, we'll just\ndisplay the results for two states, California (CA) and Florida (FL), to get a\nsense of the model performance while keeping the graphic simple.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-arx_2918e9947b5ecabca1115f6cc2d8eb62'}\n\n```{.r .cell-code code-fold=\"true\"}\nforecasts_filtered <- forecasts_version_unaware %>%\n tibble() %>%\n filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_data_filtered <- us_latest$DT %>%\n tibble() %>%\n filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(forecasts_filtered, aes(x = target_date, group = forecast_date, fill = engine_type)) +\n geom_line(\n data = latest_data_filtered, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) +\n geom_line(aes(y = .pred)) +\n geom_point(aes(y = .pred), size = 0.5) +\n geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) +\n facet_grid(engine_type ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_fill_brewer(palette = \"Set1\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-arx-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nFor the two states of interest, simple linear regression clearly performs better\nthan random forest in terms of accuracy of the predictions and does not result\nin 
such overconfident predictions (overly narrow confidence bands). Though,\nin general, neither approach produces amazingly accurate forecasts. This could\nbe because the behaviour is rather different across states and the effects of\nother notable factors such as age and public health measures may be important to\naccount for in such forecasting. Including such factors as well as making\nenhancements such as correcting for outliers are some improvements one could\nmake to this simple model.[^1]\n\n[^1]: Note that, despite the above caveats, simple models like this tend to\nout-perform many far more complicated models in online COVID-19 forecasting due\nto those models' high-variance predictions.\n\n### Example using case data from Canada\n\nBy leveraging the flexibility of `epiprocess`, we can apply the same techniques\nto data from other sources. Since some collaborators are in British Columbia,\nCanada, we'll do essentially the same thing for Canada as we did above.\n\nThe [COVID-19 Canada Open Data Working Group](https://opencovid.ca/) collects\ndaily time series data on COVID-19 cases, deaths, recoveries, testing and\nvaccinations at the health region and province levels. Data are collected from\npublicly available sources such as government datasets and news releases.\nUnfortunately, there is no simple versioned source, so we have created our own\nfrom the GitHub commit history.\n\nFirst, we load versioned case rates at the provincial level. After converting\nthese to 7-day averages (due to highly variable provincial reporting\nmismatches), we then convert the data to an `epi_archive` object, and extract\nthe latest version from it. 
Finally, we run the same forecasting exercise as for\nthe American data, but here we compare the forecasts produced from using simple\nlinear regression with those from using boosted regression trees.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/get-can-fc_2457088f4bfc3bada5f7c38814504be7'}\n\n```{.r .cell-code}\n# source(\"drafts/canada-case-rates.R)\ncan <- epidatasets::can_prov_cases\ncan <- can %>%\n group_by(version, geo_value) %>%\n arrange(time_value) %>%\n mutate(cr_7dav = RcppRoll::roll_meanr(case_rate, n = 7L)) %>%\n as_epi_archive(compactify = TRUE)\n\ncan_latest <- epix_as_of(can, max_version = max(can$DT$version)) %>%\n mutate(version = time_value) %>%\n as_epi_archive()\n\n# Generate the forecasts, and bind them together\ncan_fc <- bind_rows(\n map(\n aheads,\n ~ forecast_k_week_ahead(can_latest, \"cr_7dav\", \"cr_7dav\", .x, linear_reg())\n ),\n map(\n aheads,\n ~ forecast_k_week_ahead(can_latest, \"cr_7dav\", \"cr_7dav\", .x, boost_tree(mode = \"regression\", trees = 20))\n )\n)\n```\n:::\n\n\nThe first figure shows the results for all of the provinces using linear regression.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-lr_f7e7878c3f1a72f4cb9216d68aa63292'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"lm\"),\n aes(x = target_date, group = forecast_date)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest$DT %>% tibble(), aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = .pred)) +\n geom_point(aes(y = .pred), size = 0.5) +\n geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand 
= expansion(c(0, 0.05))) +\n labs(\n title = \"Using simple linear regression\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-lr-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nCompare those forecasts with a related set using Gradient Boosting.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-boost_58502edbabde6914bca10e407c6f445f'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"xgboost\"),\n aes(x = target_date, group = forecast_date)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest$DT %>% tibble(), aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = .pred)) +\n geom_point(aes(y = .pred), size = 0.5) +\n geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(\n title = \"Using boosted regression trees\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-boost-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nBoth approaches tend to produce quite volatile forecasts (point predictions)\nand/or are overly confident (very narrow bands), particularly when boosted\nregression trees are used. 
But as this is meant to be a simple demonstration of\nsliding with different engines in `arx_forecaster`, we may devote another\nvignette to work on improving the predictive modelling using the suite of tools\navailable in epipredict.\n\n## Version-aware forecasting\n\n### Example using case data from US states\n\nWe will now run pseudoprospective forecasts based on properly-versioned data\n(that would have been available in real-time) to forecast future COVID-19 case\nrates from current and past COVID-19 case rates for all states. All we have to\ndo is use the historical archive of the data with version information,\n`us_archive`, instead of `us_latest` like we did above, in the argument to our\nforecaster wrapper `forecast_k_week_ahead()`. Below we do that computation, tag\nit, and combine it with the forecasts from one of the engines made above.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-ar-kweek-asof_4b8247ffc26ffb4f84a5646852522f3b'}\n\n```{.r .cell-code}\n# Generate the forecasts, and bind them together\nforecasts_version_aware <- bind_rows(\n map(aheads, ~ forecast_k_week_ahead(\n us_archive,\n outcome = \"case_rate\",\n predictors = c(\"case_rate\", \"percent_cli\"),\n ahead = .x,\n engine = linear_reg()\n )) %>%\n bind_rows() %>%\n mutate(version = \"version faithful\"),\n forecasts_version_unaware %>% filter(engine_type == \"lm\") %>% mutate(version = \"version unfaithful\")\n)\n```\n:::\n\n\nNow we can plot the results on top of the latest case rates. 
As before, we will\nonly display and focus on the results for FL and CA for simplicity.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-ar-asof_884abfb3ac2d5a8dfe9bc404ff34c5d4'}\n\n```{.r .cell-code code-fold=\"true\"}\nforecasts_filtered <- forecasts_version_aware %>%\n tibble() %>%\n filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_data_filtered <- us_latest$DT %>%\n tibble() %>%\n select(-version) %>%\n filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(forecasts_filtered, aes(x = target_date, group = forecast_date, fill = version)) +\n geom_line(\n data = latest_data_filtered, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) +\n geom_line(aes(y = .pred)) +\n geom_point(aes(y = .pred), size = 0.5) +\n geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) +\n facet_grid(version ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_fill_brewer(palette = \"Set1\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-ar-asof-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAgain, we observe that the results are not great for these two states, but\nthat's likely due to the simplicity of the model (e.g., the omission of key\nfactors such as age and public health measures) and the quality of the data (e.g.,\nwe have not personally corrected for anomalies in the data).\n\nWe shall leave it to the reader to try the above version-aware and version-unaware\nforecasting exercise on the Canadian case rate data. 
The above code for the\nAmerican state data should be readily adaptable for this purpose.\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg b/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg index 7ffb5a0..c21dcfc 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svgdiff --git a/_freeze/sliding-forecasters/figure-html/plot-arx-1.svg b/_freeze/sliding-forecasters/figure-html/plot-arx-1.svg index 6c1d4fc..a33628d 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-arx-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-arx-1.svgdiff --git a/_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svg b/_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svg index 2c407fd..63e1091 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-can-fc-boost-1.svgdiff --git a/_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svg b/_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svg index f082558..5df52bb 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-can-fc-lr-1.svgdiff --git a/archive.qmd b/archive.qmd index 36e24f0..2a552c6 100644 --- a/archive.qmd +++ b/archive.qmd @@ -16,8 +16,8 @@ claims, available through the [COVIDcast API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). This signal is subject to very heavy and regular revision; you can read more about it on its [API documentation -page](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). We'll use the offline version stored in `{epidatasets}`. - +page](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). +We'll use the offline version stored in `{epidatasets}`. 
```{r, include=FALSE} source("_common.R") @@ -36,7 +36,7 @@ tibble, provided that it has (at least) the following columns: the data for January 14, 2022 that were available one day later. As we can see from the above, the data frame returned by -`epidatr::covidcast()` has the columns required for the `epi_archive` +`epidatr::pub_covidcast()` has the columns required for the `epi_archive` format, so we use `as_epi_archive()` to cast it into `epi_archive` format.[^1] @@ -47,7 +47,7 @@ to the [compactify vignette](https://cmu-delphi.github.io/epiprocess/articles/co ```{r} x <- archive_cases_dv_subset_dt %>% - select(geo_value, time_value, version, percent_cli) %>% + select(geo_value, time_value, version, percent_cli) %>% as_epi_archive(compactify = TRUE) class(x) @@ -70,8 +70,8 @@ below). There can only be a single row per unique combination of key variables, and therefore the key variables are critical for figuring out how to generate a snapshot of data from the archive, as of a given version (also described below). -```{r, error=TRUE} -key(x$DT) +```{r} +data.table::key(x$DT) ``` In general, the last version of each observation is carried forward (LOCF) to @@ -100,7 +100,7 @@ the signal variables as of a given version. This can be accessed via `epix_as_of()`. ```{r} -x_snapshot <- epix_as_of(x, max_version = as.Date("2021-06-01")) +x_snapshot <- epix_as_of(x, version = as.Date("2021-06-01")) class(x_snapshot) x_snapshot max(x_snapshot$time_value) @@ -120,7 +120,7 @@ this case, since updates to the current version may still come in at a later point in time, due to various reasons, such as synchronization issues. ```{r} -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of(x, version = max(x$DT$version)) ``` Below, we pull several snapshots from the archive, spaced one month apart. 
We @@ -134,8 +134,9 @@ versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") snapshots <- map( versions, function(v) { - epix_as_of(x, max_version = v) %>% mutate(version = v) - }) %>% + epix_as_of(x, version = v) %>% mutate(version = v) + } +) %>% list_rbind() %>% bind_rows(x_latest %>% mutate(version = self_max)) %>% mutate(latest = version == self_max) @@ -143,18 +144,22 @@ snapshots <- map( ```{r, fig.height=7} #| code-fold: true -ggplot(snapshots %>% filter(!latest), - aes(x = time_value, y = percent_cli)) + +ggplot( + snapshots %>% filter(!latest), + aes(x = time_value, y = percent_cli) +) + geom_line(aes(color = factor(version)), na.rm = TRUE) + geom_vline(aes(color = factor(version), xintercept = version), lty = 2) + - facet_wrap(~ geo_value, scales = "free_y", ncol = 1) + + facet_wrap(~geo_value, scales = "free_y", ncol = 1) + scale_x_date(minor_breaks = "month", date_labels = "%b %Y") + scale_color_viridis_d(option = "A", end = .9) + labs(x = "Date", y = "% of doctor's visits with CLI") + theme(legend.position = "none") + - geom_line(data = snapshots %>% filter(latest), - aes(x = time_value, y = percent_cli), - inherit.aes = FALSE, color = "black", na.rm = TRUE) + geom_line( + data = snapshots %>% filter(latest), + aes(x = time_value, y = percent_cli), + inherit.aes = FALSE, color = "black", na.rm = TRUE + ) ``` We can see some interesting and highly nontrivial revision behavior: at some @@ -164,7 +169,6 @@ they overestimate it (both states towards the beginning of 2021), though not quite as dramatically. Modeling the revision process, which is often called *backfill modeling*, is an important statistical problem in it of itself. 
- ## Merging `epi_archive` objects Now we demonstrate how to merge two `epi_archive` objects together, e.g., so diff --git a/epidf.qmd b/epidf.qmd index 73d17bc..0e1f654 100644 --- a/epidf.qmd +++ b/epidf.qmd @@ -5,7 +5,7 @@ source("_common.R") ``` -We'll start by showing how to get data into +We'll start by showing how to get data into `epi_df`, which is just a tibble with a bit of special structure, and is the format assumed by all of the functions in the `epiprocess` package. An `epi_df` object has (at least) the @@ -43,16 +43,13 @@ cases <- pub_covidcast( colnames(cases) ``` -As we can see, a data frame returned by `epidatr::covidcast()` has the +As we can see, a data frame returned by `epidatr::pub_covidcast()` has the columns required for an `epi_df` object (along with many others). We can use `as_epi_df()`, with specification of some relevant metadata, to bring the data frame into `epi_df` format. ```{r, message = FALSE} -x <- as_epi_df(cases, - geo_type = "state", - time_type = "day", - as_of = max(cases$issue)) %>% +x <- as_epi_df(cases, as_of = max(cases$issue)) %>% select(geo_value, time_value, total_cases = value) class(x) @@ -64,7 +61,7 @@ attributes(x)$metadata ## Some details on metadata In general, an `epi_df` object has the following fields in its metadata: - + * `geo_type`: the type for the geo values. * `time_type`: the type for the time values. * `as_of`: the time value at which the given data were available. @@ -86,10 +83,10 @@ data set. See the [archive vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for more. -If any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a +If any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a call to `as_epi_df()`, then this function will try to infer them from the passed object. 
Usually, `geo_type` and `time_type` can be inferred from the `geo_value` -and `time_value` columns, respectively, but inferring the `as_of` field is not +and `time_value` columns, respectively, but inferring the `as_of` field is not as easy. See the documentation for `as_epi_df()` more details. ```{r} @@ -109,25 +106,29 @@ In the following examples we will show how to create an `epi_df` with additional set.seed(12345) ex1 <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), - county_code = c("06059", "06061", "06067", "12111", "12113", "12117", - "42101", "42103", "42105"), + county_code = c( + "06059", "06061", "06067", "12111", "12113", "12117", + "42101", "42103", "42105" + ), time_value = rep( - seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "1 day"), - length.out = 9), + seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "1 day"), + length.out = 9 + ), value = rpois(9, 5) -) %>% +) %>% as_tsibble(index = time_value, key = c(geo_value, county_code)) -ex1 <- as_epi_df(x = ex1, geo_type = "state", time_type = "day", as_of = "2020-06-03") +ex1 <- as_epi_df(x = ex1, as_of = "2020-06-03") ``` The metadata now includes `county_code` as an extra key. + ```{r} attr(ex1, "metadata") ``` -### Dealing with misspecified column names +### Dealing with misspecified column names `epi_df` requires there to be columns `geo_value` and `time_value`, if they do not exist then `as_epi_df()` throws an error. @@ -136,27 +137,27 @@ ex2 <- data.frame( state = rep(c("ca", "fl", "pa"), each = 3), # misnamed pol = rep(c("blue", "swing", "swing"), each = 3), # extra key reported_date = rep( - seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), - length.out = 9), # misnamed + seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), + length.out = 9 + ), # misnamed value = rpois(9, 5) -) -ex2 %>% as_epi_df() +) +ex2 %>% as_epi_df() ``` -The columns should be renamed to match `epi_df` format. +The columns should be renamed to match `epi_df` format. 
```{r} -ex2 <- ex2 %>% +ex2 <- ex2 %>% rename(geo_value = state, time_value = reported_date) %>% - as_epi_df(geo_type = "state", - as_of = "2020-06-03", - additional_metadata = list(other_keys = "pol") + as_epi_df( + as_of = "2020-06-03", + other_keys = "pol" ) attr(ex2, "metadata") ``` - ### Adding additional keys to an `epi_df` object In the above examples, all the keys are added to objects prior to conversion to @@ -166,22 +167,23 @@ We'll look at an included dataset and filter to a single state for simplicity. ```{r} ex3 <- jhu_csse_county_level_subset %>% filter(time_value > "2021-12-01", state_name == "Massachusetts") %>% - slice_tail(n = 6) - + slice_tail(n = 6) + attr(ex3, "metadata") # geo_type is county currently ``` -Now we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. The "state" `geo_type` anticipates lower-case abbreviations, so we'll match that. +Now we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. The "state" `geo_type` anticipates lower-case abbreviations, so we'll match that. ```{r} -ex3 <- ex3 %>% +ex3 <- ex3 %>% as_tibble() %>% # drop the `epi_df` class before adding additional metadata mutate( state = rep(tolower("MA"), 6), - pol = rep(c("blue", "swing", "swing"), each = 2)) %>% - as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) + pol = rep(c("blue", "swing", "swing"), each = 2) + ) %>% + as_epi_df(other_keys = c("state", "pol")) -attr(ex3,"metadata") +attr(ex3, "metadata") ``` Note that the two additional keys we added, `state` and `pol`, are specified as a character vector in the `other_keys` component of the `additional_metadata` list. They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys. @@ -199,17 +201,17 @@ package. Of course, we can also write custom code for other downstream uses, like plotting, which is pretty easy to do `ggplot2`. 
```{r, message = FALSE, warning = FALSE} -ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) + +ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) + geom_line() + scale_color_brewer(palette = "Set1") + scale_x_date(minor_breaks = "month", date_labels = "%b %Y") + labs(x = "Date", y = "Cumulative COVID-19 cases", color = "State") ``` -Finally, we'll examine some data from other packages just to show how -we might get them into `epi_df` format. -The first is data on daily new (not cumulative) SARS -cases in Canada in 2003, from the +Finally, we'll examine some data from other packages just to show how +we might get them into `epi_df` format. +The first is data on daily new (not cumulative) SARS +cases in Canada in 2003, from the [outbreaks](https://github.com/reconverse/outbreaks) package. New cases are broken into a few categories by provenance. @@ -217,20 +219,20 @@ broken into a few categories by provenance. x <- outbreaks::sars_canada_2003 %>% mutate(geo_value = "ca") %>% select(geo_value, time_value = date, starts_with("cases")) %>% - as_epi_df(geo_type = "nation") + as_epi_df() head(x) ``` ```{r} #| code-fold: true -x <- x %>% +x <- x %>% pivot_longer(starts_with("cases"), names_to = "type") %>% mutate(type = substring(type, 7)) ggplot(x, aes(x = time_value, y = value)) + geom_col(aes(fill = type), just = 0.5) + - scale_y_continuous(breaks = 0:4*2, expand = expansion(c(0, 0.05))) + + scale_y_continuous(breaks = 0:4 * 2, expand = expansion(c(0, 0.05))) + scale_x_date(minor_breaks = "month", date_labels = "%b %Y") + labs(x = "Date", y = "SARS cases in Canada", fill = "Type") ``` @@ -243,27 +245,30 @@ x <- outbreaks::ebola_sierraleone_2014 %>% cases = ifelse(status == "confirmed", 1, 0), province = case_when( district %in% c("Kailahun", "Kenema", "Kono") ~ "Eastern", - district %in% c("Bombali", "Kambia", "Koinadugu", - "Port Loko", "Tonkolili") ~ "Northern", + district %in% c( + "Bombali", "Kambia", "Koinadugu", + "Port Loko", 
"Tonkolili" + ) ~ "Northern", district %in% c("Bo", "Bonthe", "Moyamba", "Pujehun") ~ "Sourthern", - district %in% c("Western Rural", "Western Urban") ~ "Western") - ) %>% + district %in% c("Western Rural", "Western Urban") ~ "Western" + ) + ) %>% select(geo_value = province, time_value = date_of_onset, cases) %>% filter(cases == 1) %>% - group_by(geo_value, time_value) %>% + group_by(geo_value, time_value) %>% summarise(cases = sum(cases)) %>% - as_epi_df(geo_type = "province") + as_epi_df() ``` ```{r} #| code-fold: true #| fig-width: 8 #| fig-height: 6 -ggplot(x, aes(x = time_value, y = cases)) + - geom_col(aes(fill = geo_value), show.legend = FALSE) + - facet_wrap(~ geo_value, scales = "free_y") + +ggplot(x, aes(x = time_value, y = cases)) + + geom_col(aes(fill = geo_value), show.legend = FALSE) + + facet_wrap(~geo_value, scales = "free_y") + scale_x_date(minor_breaks = "month", date_labels = "%b %Y") + - labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone") + labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone") ``` diff --git a/outliers.qmd b/outliers.qmd index 16c66e8..dc68d52 100644 --- a/outliers.qmd +++ b/outliers.qmd @@ -12,12 +12,12 @@ source("_common.R") ``` ```{r} -x <- incidence_num_outlier_example +incidence_num_outlier_example ``` ```{r, warning=FALSE, message=FALSE} #| code-fold: true -ggplot(x, aes(x = time_value, y = cases, color = geo_value)) + +ggplot(incidence_num_outlier_example, aes(x = time_value, y = cases, color = geo_value)) + geom_line() + scale_color_manual(values = c(3, 6)) + geom_hline(yintercept = 0, linetype = 3) + @@ -36,13 +36,13 @@ methods on a given signal, and then (optionally) combine the results from those methods. Here, we'll investigate outlier detection results from the following methods. -1. 
Detection based on a rolling median, using `detect_outlr_rm()`, which - computes a rolling median on with a default window size of `n` time points - centered at the time point under consideration, and then computes thresholds - based on a multiplier times a rolling IQR computed on the residuals. +1. Detection based on a rolling median, using `detect_outlr_rm()`, which + computes a rolling median on with a default window size of `n` time points + centered at the time point under consideration, and then computes thresholds + based on a multiplier times a rolling IQR computed on the residuals. 2. Detection based on a seasonal-trend decomposition using LOESS (STL), using - `detect_outlr_stl()`, which is similar to the rolling median method but - replaces the rolling median with fitted values from STL. + `detect_outlr_stl()`, which is similar to the rolling median method but + replaces the rolling median with fitted values from STL. 3. Detection based on an STL decomposition, but without seasonality term, which amounts to smoothing using LOESS. @@ -50,7 +50,7 @@ The outlier detection methods are specified using a `tibble` that is passed to `detect_outlr()`, with one row per method, and whose columms specify the outlier detection function, any input arguments (only nondefault values need to be supplied), and an abbreviated name for the method used in tracking results. -Abbreviations "rm" and "stl" can be used for the built-in detection functions +Abbreviations "rm" and "stl" can be used for the built-in detection functions `detect_outlr_rm()` and `detect_outlr_stl()`, respectively. 
```{r} @@ -63,12 +63,7 @@ detection_methods = bind_rows( args = list(list(detect_negatives = TRUE, detection_multiplier = 2.5, seasonal_period = 7)), - abbr = "stl_seasonal"), - tibble(method = "stl", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5, - seasonal_period = NULL)), - abbr = "stl_nonseasonal")) + abbr = "stl_seasonal")) detection_methods ``` @@ -79,7 +74,7 @@ Note that using this combined median threshold is equivalent to using a majority vote across the base methods to determine whether a value is an outlier. ```{r} -x <- x %>% +x <- incidence_num_outlier_example %>% group_by(geo_value) %>% mutate( outlier_info = detect_outlr( @@ -87,51 +82,51 @@ x <- x %>% methods = detection_methods, combiner = "median") ) %>% - ungroup() %>% - unnest(outlier_info) + unpack(outlier_info) %>% + ungroup() x ``` -To visualize the results, we define a convenience function for and call it on +To visualize the results, we define a convenience function for and call it on each state separately (hidden below the fold). 
```{r} #| code-fold: true # Plot outlier detection bands and/or points identified as outliers plot_outlr <- function( - x, signal, method_abbr, bands = TRUE, points = TRUE, + x, signal, method_abbr, bands = TRUE, points = TRUE, facet_vars = vars(geo_value), nrow = NULL, ncol = NULL, scales = "fixed") { - - # Convert outlier detection results to long format + + # Convert outlier detection results to long format signal <- rlang::enquo(signal) x_long <- x %>% pivot_longer( cols = starts_with(method_abbr), names_to = c("method", ".value"), names_pattern = "(.+)_(.+)") - + # Start of plot with observed data p <- ggplot() + geom_line(data = x, mapping = aes(x = time_value, y = !!signal)) # If requested, add bands - if (bands) - p <- p + geom_ribbon(data = x_long, - aes(x = time_value, ymin = lower, ymax = upper, + if (bands) + p <- p + geom_ribbon(data = x_long, + aes(x = time_value, ymin = lower, ymax = upper, color = method), fill = NA) # If requested, add points if (points) { x_detected <- x_long %>% filter((!!signal < lower) | (!!signal > upper)) - p <- p + geom_point(data = x_detected, - aes(x = time_value, y = !!signal, color = method, + p <- p + geom_point(data = x_detected, + aes(x = time_value, y = !!signal, color = method, shape = method)) } # If requested, add faceting - if (!is.null(facet_vars)) + if (!is.null(facet_vars)) p <- p + facet_wrap(facet_vars, nrow = nrow, ncol = ncol, scales = scales) return(p) @@ -159,7 +154,7 @@ plot_outlr(x %>% filter(geo_value == "nj"), cases, method_abbr, labs(x = "Date", y = "Reported COVID-19 counts", color = "Method", shape = "Method") + scale_color_brewer(palette = "Set1") + - ggtitle("New Jersey") + + ggtitle("New Jersey") + theme(legend.position = "bottom") ``` @@ -167,13 +162,13 @@ plot_outlr(x %>% filter(geo_value == "nj"), cases, method_abbr, Finally, in order to correct outliers, we can use the posited replacement values returned by each outlier detection method. 
Below we use the replacement value -from the combined method, which is defined by the median of replacement values +from the combined method, which is defined by the median of replacement values from the base methods at each time point. ```{r, fig.width = 8, fig.height = 7} -y <- x %>% +y <- x %>% mutate(cases_corrected = combined_replacement) %>% - select(geo_value, time_value, cases, cases_corrected) + select(geo_value, time_value, cases, cases_corrected) y %>% filter(cases != cases_corrected) ``` @@ -193,6 +188,6 @@ y %>% theme(legend.position = "bottom", legend.title = element_blank()) ``` -More advanced correction functionality will be coming at some point in the -future. +More advanced correction functionality will be coming at some point in the +future. diff --git a/renv.lock b/renv.lock index ad975e4..59ee76b 100644 --- a/renv.lock +++ b/renv.lock @@ -18,14 +18,14 @@ }, "DBI": { "Package": "DBI", - "Version": "1.2.2", + "Version": "1.2.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "methods" ], - "Hash": "164809cd72e1d5160b4cb3aa57f510fe" + "Hash": "065ae649b05f1ff66bb0c793107508f5" }, "DiceDesign": { "Package": "DiceDesign", @@ -50,14 +50,14 @@ }, "KernSmooth": { "Package": "KernSmooth", - "Version": "2.23-22", + "Version": "2.23-24", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "stats" ], - "Hash": "2fecebc3047322fa5930f74fae5de70f" + "Hash": "9f33a1ee37bbe8919eb2ec4b9f2473a5" }, "MASS": { "Package": "MASS", @@ -98,19 +98,6 @@ ], "Hash": "8c7115cd3a0e048bda2a7cd110549f7a" }, - "MatrixModels": { - "Package": "MatrixModels", - "Version": "0.5-3", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "Matrix", - "R", - "methods", - "stats" - ], - "Hash": "0776bf7526869e0286b0463cb72fb211" - }, "R6": { "Package": "R6", "Version": "2.5.1", @@ -133,18 +120,18 @@ }, "Rcpp": { "Package": "Rcpp", - "Version": "1.0.12", + "Version": "1.0.13", "Source": "Repository", 
"Repository": "RSPM", "Requirements": [ "methods", "utils" ], - "Hash": "5ea2700d21e038ace58269ecdbeb9ec0" + "Hash": "f27411eb6d9c3dada5edd444b8416675" }, "RcppEigen": { "Package": "RcppEigen", - "Version": "0.3.4.0.0", + "Version": "0.3.4.0.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -153,18 +140,18 @@ "stats", "utils" ], - "Hash": "df49e3306f232ec28f1604e36a202847" + "Hash": "4ac8e423216b8b70cb9653d1b3f71eb9" }, "RcppRoll": { "Package": "RcppRoll", - "Version": "0.3.0", + "Version": "0.3.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "Rcpp" ], - "Hash": "84a03997fbb5acfb3c9b43bad88fea1f" + "Hash": "6659c0ecb7b85f322f93e7f1e6ac7d35" }, "SQUAREM": { "Package": "SQUAREM", @@ -176,20 +163,6 @@ ], "Hash": "0cf10dab0d023d5b46a5a14387556891" }, - "SparseM": { - "Package": "SparseM", - "Version": "1.81", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "graphics", - "methods", - "stats", - "utils" - ], - "Hash": "2042cd9759cc89a453c4aefef0ce9aae" - }, "anytime": { "Package": "anytime", "Version": "0.3.9", @@ -214,13 +187,13 @@ }, "backports": { "Package": "backports", - "Version": "1.4.1", + "Version": "1.5.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "c39fbec8a30d23e721980b8afb31984c" + "Hash": "e1e1b9d75c37401117b636b7ae50827a" }, "base64enc": { "Package": "base64enc", @@ -234,7 +207,7 @@ }, "bayestestR": { "Package": "bayestestR", - "Version": "0.13.2", + "Version": "0.14.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -246,29 +219,23 @@ "stats", "utils" ], - "Hash": "4a6a2eebe2db1dfb1c792c4ed91e73dc" + "Hash": "71e7da5d38487173de67a1f0d763ceef" }, "bit": { "Package": "bit", - "Version": "4.0.5", + "Version": "4.5.0", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "bit", - "RemoteRef": "bit", - "RemoteRepos": 
"https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "4.0.5", "Requirements": [ "R" ], - "Hash": "d242abec29412ce988848d0294b208fd" + "Hash": "5dc7b2677d65d0e874fc4aaf0e879987" }, "bit64": { "Package": "bit64", - "Version": "4.0.5", + "Version": "4.5.2", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "bit", @@ -276,19 +243,13 @@ "stats", "utils" ], - "Hash": "9fe98599ca456d6552421db0d6772d8f" + "Hash": "e84984bf5f12a18628d9a02322128dfd" }, "blob": { "Package": "blob", "Version": "1.2.4", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "blob", - "RemoteRef": "blob", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.2.4", "Requirements": [ "methods", "rlang", @@ -298,14 +259,13 @@ }, "broom": { "Package": "broom", - "Version": "1.0.5", + "Version": "1.0.7", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "backports", "dplyr", - "ellipsis", "generics", "glue", "lifecycle", @@ -315,11 +275,11 @@ "tibble", "tidyr" ], - "Hash": "fd25391c3c4f6ecf0fa95f1e6d15378c" + "Hash": "8fcc818f3b9887aebaf206f141437cc9" }, "bslib": { "Package": "bslib", - "Version": "0.7.0", + "Version": "0.8.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -337,18 +297,18 @@ "rlang", "sass" ], - "Hash": "8644cc53f43828f19133548195d7e59e" + "Hash": "b299c6741ca9746fb227debcb0f9fb6c" }, "cachem": { "Package": "cachem", - "Version": "1.0.8", + "Version": "1.1.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "fastmap", "rlang" ], - "Hash": "c35768291560ce302c0a6589f92e837d" + "Hash": "cd9a672193789068eb5a2aad65a0dedf" }, "callr": { "Package": "callr", @@ -377,7 +337,7 @@ }, "checkmate": { "Package": "checkmate", - "Version": "2.3.1", + "Version": "2.3.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -385,7 +345,7 @@ 
"backports", "utils" ], - "Hash": "c01cab1cb0f9125211a6fc99d540e315" + "Hash": "0e14e01ce07e7c88fd25de6d4260d26b" }, "class": { "Package": "class", @@ -402,14 +362,14 @@ }, "cli": { "Package": "cli", - "Version": "3.6.2", + "Version": "3.6.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "utils" ], - "Hash": "1216ac65ac55ec0058a6f75d7ca0fd52" + "Hash": "b21916dd77a27642b447374a5d30ecf3" }, "clipr": { "Package": "clipr", @@ -423,7 +383,7 @@ }, "clock": { "Package": "clock", - "Version": "0.7.0", + "Version": "0.7.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -435,7 +395,7 @@ "tzdb", "vctrs" ], - "Hash": "3d8a84cdf9f6f8564531c49b70f3833d" + "Hash": "3dcaebd52554438d12989e5061e15de8" }, "codetools": { "Package": "codetools", @@ -449,7 +409,7 @@ }, "colorspace": { "Package": "colorspace", - "Version": "2.1-0", + "Version": "2.1-1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -459,7 +419,7 @@ "methods", "stats" ], - "Hash": "f20c47fd52fae58b4e377c37bb8c335b" + "Hash": "d954cb1c57e8d8b756165d7ba18aa55a" }, "conflicted": { "Package": "conflicted", @@ -476,7 +436,7 @@ }, "correlation": { "Package": "correlation", - "Version": "0.8.4", + "Version": "0.8.5", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -488,35 +448,29 @@ "parameters", "stats" ], - "Hash": "d8bd29a9abda6eed9aaab3ba5769f231" + "Hash": "0995955fd59a01caf80918913bc5066c" }, "cpp11": { "Package": "cpp11", - "Version": "0.4.7", + "Version": "0.5.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "5a295d7d963cc5035284dcdbaf334f4e" + "Hash": "91570bba75d0c9d3f1040c835cee8fba" }, "crayon": { "Package": "crayon", - "Version": "1.5.2", + "Version": "1.5.3", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "crayon", - "RemoteRef": "crayon", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": 
"1.5.2", "Requirements": [ "grDevices", "methods", "utils" ], - "Hash": "e8a1e41acf02548751f45c718d55aa6a" + "Hash": "859d96e65ef198fd43e82b9628d593ef" }, "credentials": { "Package": "credentials", @@ -534,28 +488,28 @@ }, "curl": { "Package": "curl", - "Version": "5.2.1", + "Version": "5.2.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "411ca2c03b1ce5f548345d2fc2685f7a" + "Hash": "d91263322a58af798f6cf3b13fd56dde" }, "data.table": { "Package": "data.table", - "Version": "1.15.4", + "Version": "1.16.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "methods" ], - "Hash": "8ee9ac56ef633d0c7cab8b2ca87d683e" + "Hash": "fb24e05d4a91d8b1c7ff8e284bde834a" }, "datawizard": { "Package": "datawizard", - "Version": "0.10.0", + "Version": "0.12.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -564,7 +518,7 @@ "stats", "utils" ], - "Hash": "62d6ec10346d3302a1299e1c54641d83" + "Hash": "611537168bbb78b57720de109ec1ad19" }, "dbplyr": { "Package": "dbplyr", @@ -612,12 +566,6 @@ "Version": "1.6.5", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "diagram", - "RemoteRef": "diagram", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.6.5", "Requirements": [ "R", "graphics", @@ -628,7 +576,7 @@ }, "dials": { "Package": "dials", - "Version": "1.2.1", + "Version": "1.3.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -643,39 +591,56 @@ "purrr", "rlang", "scales", + "sfd", "tibble", "utils", "vctrs", "withr" ], - "Hash": "999e5fa12058a2bb3a8c204e637e4707" + "Hash": "f2fbe4e90fab23fc1f95bffcd3662878" + }, + "diffobj": { + "Package": "diffobj", + "Version": "0.3.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "crayon", + "methods", + "stats", + "tools", + "utils" + ], + "Hash": "bcaa8b95f8d7d01a5dedfd959ce88ab8" }, "digest": { "Package": 
"digest", - "Version": "0.6.35", + "Version": "0.6.37", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "utils" ], - "Hash": "698ece7ba5a4fa4559e3d537e7ec3d31" + "Hash": "33698c4b3127fc9f506654607fb73676" }, "distributional": { "Package": "distributional", - "Version": "0.4.0", + "Version": "0.5.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "generics", "lifecycle", "numDeriv", + "pillar", "rlang", "stats", "utils", "vctrs" ], - "Hash": "3bad76869f2257ea4fd00a3c08c2bcce" + "Hash": "76e94de462aa18ea966a38956ecf4497" }, "doFuture": { "Package": "doFuture", @@ -721,12 +686,6 @@ "Version": "1.3.1", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "dtplyr", - "RemoteRef": "dtplyr", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.3.1", "Requirements": [ "R", "cli", @@ -743,7 +702,7 @@ }, "effectsize": { "Package": "effectsize", - "Version": "0.8.7", + "Version": "0.8.9", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -756,7 +715,7 @@ "stats", "utils" ], - "Hash": "a6900d0b5bdcbb956dbb843643279e7c" + "Hash": "7aceb5e07b6d48171c6b56714cc305ea" }, "ellipsis": { "Package": "ellipsis", @@ -774,36 +733,33 @@ "Version": "0.0.1", "Source": "GitHub", "RemoteType": "github", - "RemoteHost": "api.github.com", "RemoteUsername": "cmu-delphi", "RemoteRepo": "epidatasets", "RemoteRef": "main", - "RemoteSha": "ca86f0326e4eb08316b40972c7d3c98217e9941e", - "Remotes": "cmu-delphi/epidatr, cmu-delphi/epiprocess", + "RemoteSha": "0632a77dc30655bbbb8c9667d7365f99ad0d5622", + "RemoteHost": "api.github.com", "Requirements": [ "R" ], - "Hash": "3deba70da0ce06354cbd3206b16e36a2" + "Hash": "d8715113bd6e6fbbddb664144d999dd0" }, "epidatr": { "Package": "epidatr", - "Version": "1.1.5", + "Version": "1.2.0", "Source": "GitHub", "RemoteType": "github", - "RemoteHost": "api.github.com", "RemoteUsername": "cmu-delphi", "RemoteRepo": 
"epidatr", "RemoteRef": "dev", - "RemoteSha": "626c30bc07f4aae3c3e6a6c6b825a6cd5eee1ce7", + "RemoteSha": "0b3480889091063e5b03358cea10670292a803e6", + "RemoteHost": "api.github.com", "Requirements": [ "MMWRweek", "R", "cachem", - "cachem", "checkmate", "cli", "glue", - "glue", "httr", "jsonlite", "magrittr", @@ -813,22 +769,20 @@ "readr", "tibble", "usethis", - "usethis", "xml2" ], - "Hash": "869d57a2ad4002670ad28939fe050e82" + "Hash": "3ad6e3cc0f0a1ff4b1e976b00ba3654d" }, "epipredict": { "Package": "epipredict", - "Version": "0.0.14", + "Version": "0.0.24", "Source": "GitHub", "RemoteType": "github", "RemoteHost": "api.github.com", "RemoteUsername": "cmu-delphi", "RemoteRepo": "epipredict", "RemoteRef": "dev", - "RemoteSha": "5e50a5a112b663eff85fcac5586875352157a5c4", - "Remotes": "cmu-delphi/epidatr, cmu-delphi/epiprocess, dajmcdon/smoothqr", + "RemoteSha": "36c4c0a88f77861302b35e95b815609f9014e90d", "Requirements": [ "R", "checkmate", @@ -843,10 +797,8 @@ "lifecycle", "magrittr", "parsnip", - "quantreg", "recipes", "rlang", - "smoothqr", "stats", "tibble", "tidyr", @@ -855,31 +807,27 @@ "vctrs", "workflows" ], - "Hash": "4531cf03e3c8955857df663d7366a8f4" + "Hash": "2015c74d601879eaeb391c269cb7551d" }, "epiprocess": { "Package": "epiprocess", - "Version": "0.7.7", + "Version": "0.9.0", "Source": "GitHub", "RemoteType": "github", "RemoteHost": "api.github.com", "RemoteUsername": "cmu-delphi", "RemoteRepo": "epiprocess", "RemoteRef": "dev", - "RemoteSha": "4e65e51bb56ab70cc98fa2d37dd35a4ab2336620", - "Remotes": "cmu-delphi/epidatr, reconverse/outbreaks, glmgen/genlasso", + "RemoteSha": "44e70950a0e3c3c2bd8da52e5234dc505d99bb00", "Requirements": [ "R", - "R6", "checkmate", "cli", "data.table", "dplyr", - "fabletools", - "feasts", - "generics", "genlasso", "ggplot2", + "glue", "lifecycle", "lubridate", "magrittr", @@ -891,47 +839,20 @@ "tidyselect", "tsibble", "utils", - "vctrs" + "vctrs", + "waldo" ], - "Hash": "998ba22373923380e1ce7e787d11af18" + "Hash": 
"057dc098224b8288fdbaf47e9fd5ed86" }, "evaluate": { "Package": "evaluate", - "Version": "0.23", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "methods" - ], - "Hash": "daf4a1246be12c1fa8c7705a0935c1a0" - }, - "fabletools": { - "Package": "fabletools", - "Version": "0.4.2", + "Version": "1.0.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ - "R", - "R6", - "distributional", - "dplyr", - "generics", - "ggdist", - "ggplot2", - "lifecycle", - "progressr", - "rlang", - "scales", - "stats", - "tibble", - "tidyr", - "tidyselect", - "tsibble", - "utils", - "vctrs" + "R" ], - "Hash": "005e92a674b01825e0feb29931c03c5e" + "Hash": "6b567375113ceb7d9f800de4dd42218e" }, "fansi": { "Package": "fansi", @@ -947,42 +868,17 @@ }, "farver": { "Package": "farver", - "Version": "2.1.1", + "Version": "2.1.2", "Source": "Repository", "Repository": "RSPM", - "Hash": "8106d78941f34855c440ddb946b8f7a5" + "Hash": "680887028577f3fa2a81e410ed0d6e42" }, "fastmap": { "Package": "fastmap", - "Version": "1.1.1", - "Source": "Repository", - "Repository": "RSPM", - "Hash": "f7736a18de97dea803bde0a2daaafb27" - }, - "feasts": { - "Package": "feasts", - "Version": "0.3.2", + "Version": "1.2.0", "Source": "Repository", "Repository": "RSPM", - "Requirements": [ - "R", - "dplyr", - "fabletools", - "ggplot2", - "grid", - "gtable", - "lifecycle", - "lubridate", - "rlang", - "scales", - "slider", - "tibble", - "tidyr", - "tsibble", - "utils", - "vctrs" - ], - "Hash": "d15631c019c27e50b1a99e3e9b3b53e1" + "Hash": "aa5e1cd11c2d15497494c5292d7ffcc8" }, "fontawesome": { "Package": "fontawesome", @@ -1001,12 +897,6 @@ "Version": "1.0.0", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "forcats", - "RemoteRef": "forcats", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.0.0", "Requirements": [ "R", "cli", @@ -1047,12 +937,6 @@ "Version": "0.3.1", 
"Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "furrr", - "RemoteRef": "furrr", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "0.3.1", "Requirements": [ "R", "future", @@ -1066,7 +950,7 @@ }, "future": { "Package": "future", - "Version": "1.33.2", + "Version": "1.34.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1077,7 +961,7 @@ "parallelly", "utils" ], - "Hash": "fd7b1d69d16d0d114e4fa82db68f184c" + "Hash": "475771e3edb711591476be387c9a8c2e" }, "future.apply": { "Package": "future.apply", @@ -1145,7 +1029,7 @@ }, "gert": { "Package": "gert", - "Version": "2.0.1", + "Version": "2.1.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1156,31 +1040,7 @@ "sys", "zip" ], - "Hash": "f70d3fe2d9e7654213a946963d1591eb" - }, - "ggdist": { - "Package": "ggdist", - "Version": "3.3.2", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "Rcpp", - "cli", - "distributional", - "ggplot2", - "glue", - "grid", - "gtable", - "numDeriv", - "quadprog", - "rlang", - "scales", - "tibble", - "vctrs", - "withr" - ], - "Hash": "86ebb3543cdad6520be9bf8863167a9a" + "Hash": "347d104ed332650b737f509a703c9c7f" }, "ggplot2": { "Package": "ggplot2", @@ -1315,12 +1175,6 @@ "Version": "1.0.1", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "gower", - "RemoteRef": "gower", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.0.1", "Hash": "7a0051eef852c301b5efe2f7913dd45f" }, "gtable": { @@ -1340,7 +1194,7 @@ }, "hardhat": { "Package": "hardhat", - "Version": "1.3.1", + "Version": "1.4.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1351,7 +1205,7 @@ "tibble", "vctrs" ], - "Hash": "921fd010cd788de75a9c71c2c3aee1f2" + "Hash": "e7aabf81944f6c6cbbcec1f85827a279" }, "haven": { "Package": 
"haven", @@ -1376,26 +1230,20 @@ }, "highr": { "Package": "highr", - "Version": "0.10", + "Version": "0.11", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "xfun" ], - "Hash": "06230136b2d2b9ba5805e1963fa6e890" + "Hash": "d65ba49117ca223614f71b60d85b8ab7" }, "hms": { "Package": "hms", "Version": "1.1.3", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "hms", - "RemoteRef": "hms", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.1.3", "Requirements": [ "lifecycle", "methods", @@ -1438,7 +1286,7 @@ }, "httr2": { "Package": "httr2", - "Version": "1.0.1", + "Version": "1.0.5", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1455,7 +1303,7 @@ "vctrs", "withr" ], - "Hash": "03d741c92fda96d98c3a3f22494e3b4a" + "Hash": "d84e4c33206aaace37714901ac2b00c3" }, "ids": { "Package": "ids", @@ -1526,7 +1374,7 @@ }, "insight": { "Package": "insight", - "Version": "0.19.10", + "Version": "0.20.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1535,19 +1383,13 @@ "stats", "utils" ], - "Hash": "c15a38c9655cba66f5f5537a14c1bef4" + "Hash": "8457d6e682a49f2c87b698a830527b09" }, "ipred": { "Package": "ipred", - "Version": "0.9-14", + "Version": "0.9-15", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "ipred", - "RemoteRef": "ipred", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "0.9-14", "Requirements": [ "MASS", "R", @@ -1557,7 +1399,7 @@ "rpart", "survival" ], - "Hash": "b25a108cbf4834be7c1b1f46ff30f888" + "Hash": "3c3e02183ef7b9225213b531d0ce43f5" }, "isoband": { "Package": "isoband", @@ -1593,17 +1435,17 @@ }, "jsonlite": { "Package": "jsonlite", - "Version": "1.8.8", + "Version": "1.8.9", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "methods" ], - "Hash": 
"e1b9c55281c5adc4dd113652d9e26768" + "Hash": "4e993b65c2c3ffbffce7bb3e2c6f832b" }, "knitr": { "Package": "knitr", - "Version": "1.46", + "Version": "1.48", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1615,7 +1457,7 @@ "xfun", "yaml" ], - "Hash": "6e008ab1d696a5283c79765fa7b56b47" + "Hash": "acf380f300c721da9fde7df115a5f86f" }, "labeling": { "Package": "labeling", @@ -1666,20 +1508,14 @@ }, "lhs": { "Package": "lhs", - "Version": "1.1.6", + "Version": "1.2.0", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "lhs", - "RemoteRef": "lhs", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "1.1.6", "Requirements": [ "R", "Rcpp" ], - "Hash": "a007ff66aa9d478e220bf0493a7b1d95" + "Hash": "6d18e58d3d1de31b6e5415c1fe291113" }, "lifecycle": { "Package": "lifecycle", @@ -1767,7 +1603,7 @@ }, "modelbased": { "Package": "modelbased", - "Version": "0.8.7", + "Version": "0.8.8", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1782,11 +1618,11 @@ "stats", "utils" ], - "Hash": "857859a5dd55f53a2c6ab14fbdb6acc1" + "Hash": "be0465e9a8078f1c5a15344a2c130266" }, "modeldata": { "Package": "modeldata", - "Version": "1.3.0", + "Version": "1.4.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1797,7 +1633,7 @@ "rlang", "tibble" ], - "Hash": "6ac8ee87ffebd14b29586fce684c14cc" + "Hash": "a88b3cef9f6a41e075163e767ad8c8fa" }, "modelenv": { "Package": "modelenv", @@ -1817,12 +1653,6 @@ "Version": "0.1.11", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "modelr", - "RemoteRef": "modelr", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "0.1.11", "Requirements": [ "R", "broom", @@ -1849,7 +1679,7 @@ }, "nlme": { "Package": "nlme", - "Version": "3.1-164", + "Version": "3.1-166", "Source": "Repository", "Repository": 
"RSPM", "Requirements": [ @@ -1859,7 +1689,7 @@ "stats", "utils" ], - "Hash": "a623a2239e642806158bc4dc3f51565d" + "Hash": "ccbb8846be320b627e6aa2b4616a2ded" }, "nnet": { "Package": "nnet", @@ -1885,13 +1715,13 @@ }, "openssl": { "Package": "openssl", - "Version": "2.1.2", + "Version": "2.2.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "askpass" ], - "Hash": "ea2475b073243d9d338aa8f086ce973e" + "Hash": "d413e0fef796c9401a4419485f709ca1" }, "outbreaks": { "Package": "outbreaks", @@ -1905,7 +1735,7 @@ }, "parallelly": { "Package": "parallelly", - "Version": "1.37.1", + "Version": "1.38.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1913,11 +1743,11 @@ "tools", "utils" ], - "Hash": "5410df8d22bd36e616f2a2343dbb328c" + "Hash": "6e8b139c1904f5e9e14c69db64453bbe" }, "parameters": { "Package": "parameters", - "Version": "0.21.6", + "Version": "0.22.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1930,7 +1760,7 @@ "stats", "utils" ], - "Hash": "1f1bf75cb49c61df8287a0fa3b68126f" + "Hash": "ee0115da94a9cf7c451615415ce65c03" }, "parsnip": { "Package": "parsnip", @@ -1963,11 +1793,12 @@ }, "patchwork": { "Package": "patchwork", - "Version": "1.2.0", + "Version": "1.3.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "cli", + "farver", "ggplot2", "grDevices", "graphics", @@ -1977,11 +1808,11 @@ "stats", "utils" ], - "Hash": "9c8ab14c00ac07e9e04d1664c0b74486" + "Hash": "e23fb9ecb1258207bcb763d78d513439" }, "performance": { "Package": "performance", - "Version": "0.11.0", + "Version": "0.12.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1992,7 +1823,7 @@ "stats", "utils" ], - "Hash": "eb8ecde248cd610ae3097f5d00718cbd" + "Hash": "92be9503bc3394c464688fb6b03002e3" }, "pillar": { "Package": "pillar", @@ -2065,7 +1896,7 @@ }, "prodlim": { "Package": "prodlim", - "Version": "2023.08.28", + "Version": "2024.06.25", "Source": "Repository", "Repository": "RSPM", 
"Requirements": [ @@ -2080,7 +1911,7 @@ "stats", "survival" ], - "Hash": "c73e09a2039a0f75ac0a1e5454b39993" + "Hash": "d1e73a231e9442c29e21876f303382fc" }, "progress": { "Package": "progress", @@ -2110,14 +1941,14 @@ }, "ps": { "Package": "ps", - "Version": "1.7.6", + "Version": "1.8.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "utils" ], - "Hash": "dd2b9319ee0656c8acf45c7f40c59de7" + "Hash": "4b9c8485b0c7eecdf0a9ba5132a45576" }, "purrr": { "Package": "purrr", @@ -2134,44 +1965,16 @@ ], "Hash": "1cba04a4e9414bdefc9dcaa99649a8dc" }, - "quadprog": { - "Package": "quadprog", - "Version": "1.5-8", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R" - ], - "Hash": "5f919ae5e7f83a6f91dcf2288943370d" - }, - "quantreg": { - "Package": "quantreg", - "Version": "5.97", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "MASS", - "Matrix", - "MatrixModels", - "R", - "SparseM", - "graphics", - "methods", - "stats", - "survival" - ], - "Hash": "1bbc97f7d637ab3917c514a69047b2c1" - }, "ragg": { "Package": "ragg", - "Version": "1.3.0", + "Version": "1.3.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "systemfonts", "textshaping" ], - "Hash": "082e1a198e3329d571f4448ef0ede4bc" + "Hash": "0595fe5e47357111f29ad19101c7d271" }, "ranger": { "Package": "ranger", @@ -2236,7 +2039,7 @@ }, "recipes": { "Package": "recipes", - "Version": "1.0.10", + "Version": "1.1.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2245,7 +2048,6 @@ "cli", "clock", "dplyr", - "ellipsis", "generics", "glue", "gower", @@ -2265,7 +2067,7 @@ "vctrs", "withr" ], - "Hash": "69783cdd607c58fffb21c5c26c6ededf" + "Hash": "fc6672e55fcd1b5c461a3529ff6b1b08" }, "rematch": { "Package": "rematch", @@ -2286,17 +2088,17 @@ }, "renv": { "Package": "renv", - "Version": "1.0.7", + "Version": "1.0.9", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "utils" ], - "Hash": 
"397b7b2a265bc5a7a06852524dabae20" + "Hash": "ef233f0e9064fc88c898b340c9add5c2" }, "reprex": { "Package": "reprex", - "Version": "2.1.0", + "Version": "2.1.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2314,22 +2116,22 @@ "utils", "withr" ], - "Hash": "1425f91b4d5d9a8f25352c44a3d914ed" + "Hash": "97b1d5361a24d9fb588db7afe3e5bcbf" }, "rlang": { "Package": "rlang", - "Version": "1.1.3", + "Version": "1.1.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "utils" ], - "Hash": "42548638fae05fd9a9b5f3f437fbbbe2" + "Hash": "3eec01f8b1dee337674b2e34ab1f9bc1" }, "rmarkdown": { "Package": "rmarkdown", - "Version": "2.26", + "Version": "2.28", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2348,7 +2150,7 @@ "xfun", "yaml" ], - "Hash": "9b148e7f95d33aac01f31282d49e4f44" + "Hash": "062470668513dcda416927085ee9bdc7" }, "rpart": { "Package": "rpart", @@ -2460,7 +2262,7 @@ }, "see": { "Package": "see", - "Version": "0.8.4", + "Version": "0.9.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2478,7 +2280,7 @@ "performance", "stats" ], - "Hash": "3d2fd0b72314499e6af4fd20d39309dc" + "Hash": "743de04e180938d89e913f392dc9a104" }, "selectr": { "Package": "selectr", @@ -2506,6 +2308,19 @@ ], "Hash": "3f9796a8d0a0e8c6eb49a4b029359d1f" }, + "sfd": { + "Package": "sfd", + "Version": "0.1.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "rlang", + "tibble" + ], + "Hash": "8798f23058ead1d2ffd1223dfc0c8906" + }, "shape": { "Package": "shape", "Version": "1.4.6.1", @@ -2533,28 +2348,9 @@ ], "Hash": "a584625e2b9e4fad4be135c8ea5c99aa" }, - "smoothqr": { - "Package": "smoothqr", - "Version": "0.1.1", - "Source": "GitHub", - "RemoteType": "github", - "RemoteHost": "api.github.com", - "RemoteUsername": "dajmcdon", - "RemoteRepo": "smoothqr", - "RemoteRef": "main", - "RemoteSha": "3def5f0183671c1974676d08e469d538e15acea8", - "Requirements": [ - "cli", - "dplyr", - 
"quantreg", - "rlang", - "tibble" - ], - "Hash": "d7b8b29158f8d7a450e539d11c5c667b" - }, "stringi": { "Package": "stringi", - "Version": "1.8.3", + "Version": "1.8.4", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2563,7 +2359,7 @@ "tools", "utils" ], - "Hash": "058aebddea264f4c99401515182e656a" + "Hash": "39e1144fd75428983dc3f63aa53dfa91" }, "stringr": { "Package": "stringr", @@ -2584,7 +2380,7 @@ }, "survival": { "Package": "survival", - "Version": "3.6-4", + "Version": "3.7-0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2596,7 +2392,7 @@ "stats", "utils" ], - "Hash": "e6e3071f471513e4b85f98ca041303c7" + "Hash": "5aaa9cbaf4aba20f8e06fdea1850a398" }, "sys": { "Package": "sys", @@ -2607,26 +2403,28 @@ }, "systemfonts": { "Package": "systemfonts", - "Version": "1.0.6", + "Version": "1.1.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", - "cpp11" + "cpp11", + "lifecycle" ], - "Hash": "6d538cff441f0f1f36db2209ac7495ac" + "Hash": "213b6b8ed5afbf934843e6c3b090d418" }, "textshaping": { "Package": "textshaping", - "Version": "0.3.7", + "Version": "0.4.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "cpp11", + "lifecycle", "systemfonts" ], - "Hash": "997aac9ad649e0ef3b97f96cddd5622b" + "Hash": "5142f8bc78ed3d819d26461b641627ce" }, "tibble": { "Package": "tibble", @@ -2722,12 +2520,6 @@ "Version": "2.0.0", "Source": "Repository", "Repository": "RSPM", - "RemoteType": "standard", - "RemotePkgRef": "tidyverse", - "RemoteRef": "tidyverse", - "RemoteRepos": "https://packagemanager.rstudio.com/all/latest", - "RemotePkgPlatform": "source", - "RemoteSha": "2.0.0", "Requirements": [ "R", "broom", @@ -2765,7 +2557,7 @@ }, "timeDate": { "Package": "timeDate", - "Version": "4032.109", + "Version": "4041.110", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2775,7 +2567,7 @@ "stats", "utils" ], - "Hash": "fa276a2ec2555d74b4eabf56fba3d209" + "Hash": 
"c5e48e8ac24d4472ddb122bcdeb011ad" }, "timechange": { "Package": "timechange", @@ -2790,17 +2582,17 @@ }, "tinytex": { "Package": "tinytex", - "Version": "0.50", + "Version": "0.53", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "xfun" ], - "Hash": "be7a76845222ad20adb761f462eed3ea" + "Hash": "9db859e8aabbb474293dde3097839420" }, "tsibble": { "Package": "tsibble", - "Version": "1.1.4", + "Version": "1.1.5", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2817,7 +2609,7 @@ "tidyselect", "vctrs" ], - "Hash": "d5da786ac5a28f62ca2eb8255ad7b9f3" + "Hash": "a75e397766b45996310908b5b32557ba" }, "tune": { "Package": "tune", @@ -2866,7 +2658,7 @@ }, "usethis": { "Package": "usethis", - "Version": "2.2.3", + "Version": "3.0.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2893,7 +2685,7 @@ "withr", "yaml" ], - "Hash": "d524fd42c517035027f866064417d7e6" + "Hash": "b2fbf93c2127bedd2cbe9b799530d5d2" }, "utf8": { "Package": "utf8", @@ -2907,13 +2699,13 @@ }, "uuid": { "Package": "uuid", - "Version": "1.2-0", + "Version": "1.2-1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "303c19bfd970bece872f93a824e323d9" + "Hash": "34e965e62a41fcafb1ca60e9b142085b" }, "vctrs": { "Package": "vctrs", @@ -2965,6 +2757,23 @@ ], "Hash": "390f9315bc0025be03012054103d227c" }, + "waldo": { + "Package": "waldo", + "Version": "0.5.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "diffobj", + "glue", + "methods", + "rematch2", + "rlang", + "tibble" + ], + "Hash": "16aa934a49658677d8041df9017329b9" + }, "warp": { "Package": "warp", "Version": "0.2.1", @@ -2984,7 +2793,7 @@ }, "withr": { "Package": "withr", - "Version": "3.0.0", + "Version": "3.0.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -2992,7 +2801,7 @@ "grDevices", "graphics" ], - "Hash": "d31b6c62c10dcf11ec530ca6b0dd5d35" + "Hash": "07909200e8bbe90426fbfeb73e1e27aa" }, "workflows": { 
"Package": "workflows", @@ -3046,19 +2855,20 @@ }, "xfun": { "Package": "xfun", - "Version": "0.43", + "Version": "0.47", "Source": "Repository", "Repository": "RSPM", "Requirements": [ + "R", "grDevices", "stats", "tools" ], - "Hash": "ab6371d8653ce5f2f9290f4ec7b42a8e" + "Hash": "36ab21660e2d095fef0d83f689e0477c" }, "xgboost": { "Package": "xgboost", - "Version": "1.7.7.1", + "Version": "1.7.8.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -3068,7 +2878,7 @@ "jsonlite", "methods" ], - "Hash": "6303e61eac62aef7bd2b396ef7e24386" + "Hash": "f7aa70849f72103d78c99df10eae6164" }, "xml2": { "Package": "xml2", @@ -3085,10 +2895,10 @@ }, "yaml": { "Package": "yaml", - "Version": "2.3.8", + "Version": "2.3.10", "Source": "Repository", "Repository": "RSPM", - "Hash": "29240487a071f535f5e5d5a323b7afbd" + "Hash": "51dab85c6c98e50a18d7551e9d49f76c" }, "yardstick": { "Package": "yardstick", diff --git a/renv/activate.R b/renv/activate.R index d13f993..c360bf2 100644 --- a/renv/activate.R +++ b/renv/activate.R @@ -2,7 +2,7 @@ local({ # the requested version of renv - version <- "1.0.7" + version <- "1.0.9" attr(version, "sha") <- NULL # the project directory @@ -98,6 +98,66 @@ local({ unloadNamespace("renv") # load bootstrap tools + ansify <- function(text) { + if (renv_ansify_enabled()) + renv_ansify_enhanced(text) + else + renv_ansify_default(text) + } + + renv_ansify_enabled <- function() { + + override <- Sys.getenv("RENV_ANSIFY_ENABLED", unset = NA) + if (!is.na(override)) + return(as.logical(override)) + + pane <- Sys.getenv("RSTUDIO_CHILD_PROCESS_PANE", unset = NA) + if (identical(pane, "build")) + return(FALSE) + + testthat <- Sys.getenv("TESTTHAT", unset = "false") + if (tolower(testthat) %in% "true") + return(FALSE) + + iderun <- Sys.getenv("R_CLI_HAS_HYPERLINK_IDE_RUN", unset = "false") + if (tolower(iderun) %in% "false") + return(FALSE) + + TRUE + + } + + renv_ansify_default <- function(text) { + text + } + + renv_ansify_enhanced <- 
function(text) { + + # R help links + pattern <- "`\\?(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:help:\\1\a?\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # runnable code + pattern <- "`(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:run:\\1\a\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # return ansified text + text + + } + + renv_ansify_init <- function() { + + envir <- renv_envir_self() + if (renv_ansify_enabled()) + assign("ansify", renv_ansify_enhanced, envir = envir) + else + assign("ansify", renv_ansify_default, envir = envir) + + } + `%||%` <- function(x, y) { if (is.null(x)) y else x } @@ -142,7 +202,10 @@ local({ # compute common indent indent <- regexpr("[^[:space:]]", lines) common <- min(setdiff(indent, -1L)) - leave - paste(substring(lines, common), collapse = "\n") + text <- paste(substring(lines, common), collapse = "\n") + + # substitute in ANSI links for executable renv code + ansify(text) } @@ -306,7 +369,11 @@ local({ ) if ("headers" %in% names(formals(utils::download.file))) - args$headers <- renv_bootstrap_download_custom_headers(url) + { + headers <- renv_bootstrap_download_custom_headers(url) + if (length(headers) && is.character(headers)) + args$headers <- headers + } do.call(utils::download.file, args) @@ -385,10 +452,22 @@ local({ for (type in types) { for (repos in renv_bootstrap_repos()) { + # build arguments for utils::available.packages() call + args <- list(type = type, repos = repos) + + # add custom headers if available -- note that + # utils::available.packages() will pass this to download.file() + if ("headers" %in% names(formals(utils::download.file))) + { + headers <- renv_bootstrap_download_custom_headers(url) + if (length(headers) && is.character(headers)) + args$headers <- headers + } + # retrieve package database db <- tryCatch( as.data.frame( - utils::available.packages(type = type, repos = repos), + do.call(utils::available.packages, args), 
stringsAsFactors = FALSE ), error = identity @@ -470,6 +549,14 @@ local({ } + renv_bootstrap_github_token <- function() { + for (envvar in c("GITHUB_TOKEN", "GITHUB_PAT", "GH_TOKEN")) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(envval) + } + } + renv_bootstrap_download_github <- function(version) { enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") @@ -477,16 +564,16 @@ local({ return(FALSE) # prepare download options - pat <- Sys.getenv("GITHUB_PAT") - if (nzchar(Sys.which("curl")) && nzchar(pat)) { + token <- renv_bootstrap_github_token() + if (nzchar(Sys.which("curl")) && nzchar(token)) { fmt <- "--location --fail --header \"Authorization: token %s\"" - extra <- sprintf(fmt, pat) + extra <- sprintf(fmt, token) saved <- options("download.file.method", "download.file.extra") options(download.file.method = "curl", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE) - } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { + } else if (nzchar(Sys.which("wget")) && nzchar(token)) { fmt <- "--header=\"Authorization: token %s\"" - extra <- sprintf(fmt, pat) + extra <- sprintf(fmt, token) saved <- options("download.file.method", "download.file.extra") options(download.file.method = "wget", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE) diff --git a/slide.qmd b/slide.qmd index 3cd4a4f..f34529c 100644 --- a/slide.qmd +++ b/slide.qmd @@ -1,26 +1,22 @@ # Sliding computations {#sec-sliding} -A central tool in the `{epiprocess}` package is `epi_slide()`, which is based -on the powerful functionality provided in the +A central tool in the `{epiprocess}` package is `epi_slide()`, which is based on +the powerful functionality provided in the [`slider`](https://cran.r-project.org/web/packages/slider) package. In -`{epiprocess}`, to "slide" means to apply a computation---represented as a -function or formula---over a sliding/rolling data window. 
Suitable -groupings can always be achieved by a preliminary call to `group_by()`. - -By default, the meaning of one time step is inferred from the `time_value` -column of the `epi_df` object under consideration, based on the way this column -understands addition and subtraction. For example, if the time values are coded -as `Date` objects, then one time step is one day, since -`as.Date("2022-01-01") + 1` equals `as.Date("2022-01-02")`. Alternatively, the time step can be specified -manually in the call to `epi_slide()`; you can read the documentation for more -details. Furthermore, the alignment of the running window used in `epi_slide()` -can be "right", "center", or "left"; the default is "right", and is what we use -in this vignette. +`epiprocess`, to "slide" means to apply a computation---represented as a +function or formula---over a sliding/rolling data window. The function always +applies the slide inside each group and the grouping is assumed to be across all +group keys of the `epi_df` (this is the grouping used by default if you do not +group the `epi_df` with a `group_by()`). + +By default, the `.window_size` units depend on the `time_type` of the `epi_df`, +which is determined from the types in the `time_value` column of the `epi_df`. +See the "Details" in `epi_slide()` for more. As in getting started guide, we'll fetch daily reported COVID-19 cases from CA, FL, NY, and TX (note: here we're using new, not cumulative cases) using the -[`epidatr`](https://github.com/cmu-delphi/epidatr) package, -and then convert this to `epi_df` format. +[`epidatr`](https://github.com/cmu-delphi/epidatr) package, and then convert +this to `epi_df` format. 
```{r} #| include: false @@ -37,98 +33,174 @@ The example data we'll use is part of the package and has 2,684 rows and 3 colum ```{r} data(jhu_csse_daily_subset) -x <- jhu_csse_daily_subset %>% +edf <- jhu_csse_daily_subset %>% select(geo_value, time_value, cases) %>% arrange(geo_value, time_value) %>% as_epi_df() ``` +## Optimized rolling mean and sums -## Slide with a formula - -We first demonstrate how to apply a 7-day trailing average to the daily cases in -order to smooth the signal, by passing in a formula for the first argument of -`epi_slide()`. To do this computation per state, we first call `group_by()`. +For the two most common sliding operations, we offer two optimized versions: +`epi_slide_mean()` and `epi_slide_sum()`. This example gets the 7-day trailing +average of the daily cases. Note that the name of the column(s) that we want to +average is specified as the first argument of `epi_slide_mean()`. ```{r} -x %>% - group_by(geo_value) %>% - epi_slide(~ mean(.x$cases), before = 6) %>% - ungroup() +edf %>% + group_by(geo_value) %>% + epi_slide_mean("cases", .window_size = 7, na.rm = TRUE) %>% + ungroup() %>% + head(10) ``` -The formula specified has access to all non-grouping columns present in the -original `epi_df` object (and must refer to them with the prefix `.x$`). As we -can see, the function `epi_slide()` returns an `epi_df` object with a new column -appended that contains the results (from sliding), named `slide_value` as the -default. We can of course change this post hoc, or we can instead specify a new -name up front using the `new_col_name` argument: +Note that we passed `na.rm = TRUE` to `data.table::frollmean()` via `...` to +`epi_slide_mean`. 
+ +The following computes the 7-day trailing sum of daily cases (and passed `na.rm` +to `data.table::frollsum()` similarly): ```{r} -x %>% +edf %>% group_by(geo_value) %>% - epi_slide(~ mean(.x$cases), before = 6, new_col_name = "cases_7dav") %>% - ungroup() + epi_slide_sum("cases", .window_size = 7, na.rm = TRUE) %>% + ungroup() %>% + head(10) ``` -Some other information is available in additional variables: +## General sliding with a formula -* `.group_key` is a one-row tibble containing the values of the grouping - variables for the associated group -* `.ref_time_value` is the reference time value the time window was based on +The previous computations can also be performed using `epi_slide()`, which can +be used for more general sliding computations (but is much slower for the +specific cases of mean and sum). -Like in `group_modify()`, there are alternative names for these variables as -well: `.` can be used instead of `.x`, `.y` instead of `.group_key`, and `.z` -instead of `.ref_time_value`. +The same 7-day trailing average of daily cases can be computed by passing in a +formula for the first argument of `epi_slide()`: -## Slide with a function +```{r} +edf %>% + group_by(geo_value) %>% + epi_slide(~ mean(.x$cases, na.rm = TRUE), .window_size = 7) %>% + ungroup() %>% + head(10) +``` -We can also pass a function for the first argument in `epi_slide()`. In this -case, the passed function must accept the following arguments: +If your formula returns a data.frame, then the columns of the data.frame +will be unpacked into the resulting `epi_df`. 
For example, the following +computes the 7-day trailing average of daily cases and the 7-day trailing sum of +daily cases: -In this case, the passed function `f` must accept the following arguments: a -data frame with the same column names as the original object, minus any grouping -variables, containing the time window data for one group-`ref_time_value` -combination; followed by a one-row tibble containing the values of the grouping -variables for the associated group; followed by the associated `ref_time_value`. -It can accept additional arguments; `epi_slide()` will forward any `...` args it -receives to `f`. +```{r} +edf %>% + group_by(geo_value) %>% + epi_slide( + ~ data.frame(cases_mean = mean(.x$cases, na.rm = TRUE), cases_sum = sum(.x$cases, na.rm = TRUE)), + .window_size = 7 + ) %>% + ungroup() %>% + head(10) +``` -Recreating the last example of a 7-day trailing average: +Note that this formula has access to all non-grouping columns present in the +original `epi_df` object and must refer to them with the prefix `.x$...`. As we +can see, the function `epi_slide()` returns an `epi_df` object with a new column +appended that contains the results (from sliding), named `slide_value` as the +default. 
+ +Some other information is available in additional variables: + +* `.group_key` is a one-row tibble containing the values of the grouping + variables for the associated group +* `.ref_time_value` is the reference time value the time window was based on ```{r} -x %>% - group_by(geo_value) %>% - epi_slide(function(x, gk, rtv) mean(x$cases), - before = 6, new_col_name = "cases_7dav") %>% - ungroup() +# Returning geo_value in the formula +edf %>% + group_by(geo_value) %>% + epi_slide(~ .x$geo_value[[1]], .window_size = 7) %>% + ungroup() %>% + head(10) + +# Returning time_value in the formula +edf %>% + group_by(geo_value) %>% + epi_slide(~ .x$time_value[[1]], .window_size = 7) %>% + ungroup() %>% + head(10) ``` +While the computations above do not look very useful, these can be used as +building blocks for computations that do something different depending on the +geo_value or ref_time_value. + ## Slide the tidy way Perhaps the most convenient way to setup a computation in `epi_slide()` is to pass in an expression for tidy evaluation. In this case, we can simply define the name of the new column directly as part of the expression, setting it equal -to a computation in which we can access any columns of `x` by name, just as we +to a computation in which we can access any columns of `.x` by name, just as we would in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. For example: ```{r} -x <- x %>% - group_by(geo_value) %>% - epi_slide(cases_7dav = mean(cases), before = 6) %>% - ungroup() +slide_output <- edf %>% + group_by(geo_value) %>% + epi_slide(cases_7dav = mean(cases, na.rm = TRUE), .window_size = 7) %>% + ungroup() %>% + head(10) ``` -In addition to referring to individual columns by name, you can refer to the -time window data as an `epi_df` or `tibble` using `.x`. 
Similarly, the other arguments of the function format are available through the magic names `.group_key` and `.ref_time_value`, and the tidyverse "pronouns" `.data` and `.env` can also be used. + +In addition to referring to individual columns by name, you can refer to the +`epi_df` time window as `.x` (`.group_key` and `.ref_time_value` are still +available). The tidyverse "pronouns" `.data` and `.env` can also be used +if you need to distinguish between the data and environment. As a simple sanity check, we visualize the 7-day trailing averages computed on -top of the original counts. +top of the original counts: + +```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} +library(ggplot2) +theme_set(theme_bw()) + +ggplot(slide_output, aes(x = time_value)) + + geom_col(aes(y = cases, fill = geo_value), alpha = 0.5, show.legend = FALSE) + + geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) + + facet_wrap(~geo_value, scales = "free_y") + + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + + labs(x = "Date", y = "Reported COVID-19 cases") +``` + +As we can see from the top right panel, it looks like Texas moved to weekly +reporting of COVID-19 cases in summer of 2021. + +## Slide with a function + +We can also pass a function to the second argument in `epi_slide()`. In this +case, the passed function `.f` must have the form `function(x, g, t, ...)`, +where + +- "x" is an epi_df with the same column names as the original object, minus + any grouping variables, containing the time window data +- "g" is a one-row tibble containing the values of the grouping variables +for the associated group +- "t" is the ref_time_value for the current window +- "..."
are additional arguments + +Recreating the last example of a 7-day trailing average: + +```{r} +x <- edf %>% + group_by(geo_value) %>% + epi_slide(function(x, g, t) mean(x$cases, na.rm = TRUE), .window_size = 7, .new_col_name = "cases_7dav") %>% + ungroup() +x %>% + head(10) +``` ```{r, message = FALSE, warning = FALSE, fig.width = 8, fig.height = 6} #| code-fold: true cols <- RColorBrewer::brewer.pal(7, "Set1")[-6] ggplot(x, aes(x = time_value)) + - geom_col(aes(y = cases, fill = geo_value), alpha = 0.5, + geom_col(aes(y = cases, fill = geo_value), alpha = 0.5, show.legend = FALSE) + scale_y_continuous(expand = expansion(c(0, 0.05))) + geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) + @@ -139,23 +211,23 @@ ggplot(x, aes(x = time_value)) + labs(x = "Date", y = "Reported COVID-19 cases") ``` -As we can see from the center top panel, it looks like Florida moved to weekly -reporting of COVID-19 cases in summer of 2021, while California occasionally reported negative cases counts! +As we can see from the center top panel, it looks like Florida moved to weekly +reporting of COVID-19 cases in summer of 2021, while California occasionally +reported negative cases counts! ## Running a local forecaster {#sec-local-forecaster} -As a more complex example, we preview some of the functionality of `{epipredict}` described in future chapters, and use a forecaster based on a -local (in time) -autoregression or "AR model". AR models can be fit in numerous ways -(using base R -functions and various packages), but here we the `arx_forecaster()`, implemented in `{epipredict}` both -provides a more advanced example of sliding a function over an `epi_df` object, -and it allows us to be a bit more flexible in defining a *probabilistic* -forecaster: one that outputs not just a point prediction, but a notion of -uncertainty around this. 
In particular, our forecaster will output a point -prediction along with an 90\% uncertainty band, represented by a predictive -quantiles at the 5\% and 95\% levels (lower and upper endpoints of the -uncertainty band). +As a more complex example, we preview some of the functionality of +`{epipredict}` described in future chapters, and use a forecaster based on a +local (in time) autoregression or "AR model". AR models can be fit in numerous +ways (using base R functions and various packages), but here we use +`arx_forecaster()`, implemented in `{epipredict}`: it both provides a more advanced +example of sliding a function over an `epi_df` object, and it allows us to be a +bit more flexible in defining a *probabilistic* forecaster: one that outputs not +just a point prediction, but a notion of uncertainty around this. In particular, +our forecaster will output a point prediction along with a 90\% uncertainty +band, represented by predictive quantiles at the 5\% and 95\% levels (lower +and upper endpoints of the uncertainty band). The function signature below, is a probabilistic AR forecaster. The `lags` argument indicates which lags to use in the model, and `ahead` indicates @@ -165,97 +237,93 @@ considered in this vignette). ```{r eval=FALSE} arx_forecaster <- function( - epi_df, + epi_df, outcome, # the outcome column name in `epi_df` predictors, # a character vector, containing 1 or more predictors in `epi_df` - trainer = quantile_reg(), + trainer = quantile_reg(), args_list = arx_args_list( - lags = c(0, 7, 14), + lags = c(0, 7, 14), ahead = 7, quantile_levels = c(0.05, 0.95) ) -) - +) { + ... +} ``` -We go ahead and slide this AR forecaster over the working `epi_df` of COVID-19 -cases. Note that we actually model the `cases_7dav` column, to operate on the +We go ahead and slide this AR forecaster over the working `epi_df` of COVID-19 +cases. Note that we actually model the `cases_7dav` column, to operate on the scale of smoothed COVID-19 cases.
This is clearly equivalent, up to a constant, to modeling weekly sums of COVID-19 cases. ```{r, warning=FALSE} fc_time_values <- seq( - from = as.Date("2020-06-01"), - to = as.Date("2021-12-01"), + from = as.Date("2020-06-01"), + to = as.Date("2021-12-01"), by = "1 months") fcasts <- epi_slide( - x, - ~ arx_forecaster( - epi_data = .x, - outcome = "cases_7dav", - predictors = "cases_7dav", - trainer = quantile_reg(), - args_list = arx_args_list(ahead = 7))$predictions, - before = 119, - ref_time_values = fc_time_values, - new_col_name = "fc") + x, + .f = ~ arx_forecaster( + epi_data = .x, + outcome = "cases_7dav", + predictors = "cases_7dav", + trainer = quantile_reg(), + args_list = arx_args_list(ahead = 7))$predictions, + .window_size = 120, + .ref_time_values = fc_time_values) # grab just the relevant columns, and make them easier to plot fcasts <- fcasts %>% - select(geo_value, time_value, cases_7dav, - contains("_distn"), fc_target_date) %>% - pivot_quantiles_wider(contains("_distn")) + select(geo_value, time_value, cases_7dav, .pred, .pred_distn) %>% + pivot_quantiles_wider(".pred_distn") fcasts ``` -Note that here we have used an argument `ref_time_values` to perform the -sliding computation (here, compute a forecast) at a specific subset of reference -time values. We get out 4 new columns: `fc_target_date`, `0.05`, `0.5`, `0.95` -that correspond to the date the forecast is for (rather than the date it was made on, the point forecast, and the lower and upper endpoints of the -95\% prediction band.[^1] +Note that we have used the argument `.ref_time_values` to compute the forecast +at a specific subset of reference time values. 
We get out 4 new columns: +`fc_target_date`, `0.05`, `0.5`, `0.95` that correspond to the date the forecast +is for (rather than the date it was made on), the point forecast, and the lower +and upper endpoints of the 90\% prediction band.[^1] -[^1]: If instead we had set `as_list_col = TRUE` -in the call to `epi_slide()`, then we would have gotten a list column `fc`, -where each element of `fc` contains these results. +[^1]: If instead we had set `as_list_col = TRUE` in the call to `epi_slide()`, +then we would have gotten a list column `fc`, where each element of `fc` +contains these results. To finish off, we plot the forecasts at some times (spaced out by a few months) -over the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do -so, we encapsulate the process of generating forecasts into a simple function, +over the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do +so, we encapsulate the process of generating forecasts into a simple function, so that we can call it a few times.
```{r, message = FALSE, warning = FALSE} k_week_ahead <- function(ahead = 7) { epi_slide( - x, + x, ~ arx_forecaster( - epi_data = .x, - outcome = "cases_7dav", - predictors = "cases_7dav", - trainer = quantile_reg(), - args_list = arx_args_list(ahead = ahead))$predictions, - before = 119, - ref_time_values = fc_time_values, - new_col_name = "fc") %>% - select(geo_value, time_value, cases_7dav, contains("_distn"), - fc_target_date) %>% - pivot_quantiles_wider(contains("_distn")) + epi_data = .x, + outcome = "cases_7dav", + predictors = "cases_7dav", + trainer = quantile_reg(), + args_list = arx_args_list(ahead = ahead))$predictions, + .window_size = 120, + .ref_time_values = fc_time_values) %>% + select(geo_value, time_value, cases_7dav, .pred, .pred_distn) %>% + pivot_quantiles_wider(".pred_distn") } # First generate the forecasts, and bind them together z <- map(c(7, 14, 21, 28), k_week_ahead) %>% list_rbind() ``` -Then we can plot the on top of the observed data ```{r, fig.width=8, fig.height=9} #| code-fold: true ggplot(z) + - geom_line(data = x, aes(x = time_value, y = cases_7dav), color = "gray50") + - geom_ribbon(aes(x = fc_target_date, ymin = `0.05`, ymax = `0.95`, - group = time_value, fill = geo_value), alpha = 0.4) + - geom_line(aes(x = fc_target_date, y = `0.5`, group = time_value)) + - geom_point(aes(x = fc_target_date, y = `0.5`, group = time_value), size = 0.5) + - #geom_vline(data = tibble(x = fc_time_values), aes(xintercept = x), + geom_line(data = x, aes(x = time_value, y = cases_7dav), color = "gray50") + + geom_ribbon(aes(x = time_value, ymin = `0.05`, ymax = `0.95`, + group = time_value, fill = geo_value), alpha = 0.4) + + geom_line(aes(x = time_value, y = `0.5`, group = time_value)) + + geom_point(aes(x = time_value, y = `0.5`, group = time_value), size = 0.5) + + #geom_vline(data = tibble(x = fc_time_values), aes(xintercept = x), # linetype = 2, alpha = 0.5) + facet_wrap(vars(geo_value), scales = "free_y", nrow = 3) + 
scale_y_continuous(expand = expansion(c(0, 0.05))) + @@ -269,9 +337,9 @@ spotty. At various points in time, we can see that its forecasts are volatile (its point predictions are all over the place), or overconfident (its bands are too narrow), or both at the same time. This is only meant as a simple demo and not entirely unexpected given the way the AR model is set up. The -[`epipredict`](https://cmu-delphi.github.io/epipredict) package, -offers a suite of predictive modeling tools -that improve on many of the shortcomings of the above simple AR model (simply +[`epipredict`](https://cmu-delphi.github.io/epipredict) package, +offers a suite of predictive modeling tools +that improve on many of the shortcomings of the above simple AR model (simply using all states for training rather than 6 is a huge improvement). Second, the AR forecaster here is using finalized data, meaning, it uses the @@ -279,12 +347,12 @@ latest versions of signal values (reported COVID-19 cases) available, for both training models and making predictions historically. However, this is not reflective of the provisional nature of the data that it must cope with in a true forecast task. Training and making predictions on finalized data can lead -to an overly optimistic sense of accuracy; see, for example, +to an overly optimistic sense of accuracy; see, for example, [@McDonaldBien2021] and references therein. Fortunately, the `epiprocess` package provides a data structure called `epi_archive` that can be used to store all data revisions, and furthermore, an `epi_archive` object knows how to slide computations in the correct version-aware sense (for the computation at each reference time $t$, it uses -only data that would have been available as of $t$). We will revisit this -example in the [archive +only data that would have been available as of $t$). We will revisit this +example in the [archive vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html). 
diff --git a/sliding-forecasters.qmd b/sliding-forecasters.qmd index 8aee741..65d8802 100644 --- a/sliding-forecasters.qmd +++ b/sliding-forecasters.qmd @@ -1,33 +1,27 @@ -# Pseudo-prospective forecast inspection +# Sliding version-unaware and version-aware ARX forecasters across dates ```{r} #| echo: false source("_common.R") ``` +A key function from the epiprocess package is `epix_slide()` (refer to the +following vignette for the basics of the function: ["Work with archive objects +and data +revisions"](https://cmu-delphi.github.io/epiprocess/articles/archive.html)) +which allows performing version-aware computations. That is, the function only +uses data that would have been available as of time t for that reference time. -A key function from the epiprocess package is `epi_slide()`, which allows the -user to apply a function or formula-based computation over variables in an -`epi_df` over a running window of `n` time steps (see the following `epiprocess` -vignette to go over the basics of the function: ["Slide a computation over -signal values"](https://cmu-delphi.github.io/epiprocess/articles/slide.html)). -The equivalent sliding method for an `epi_archive` object can be called by using -the wrapper function `epix_slide()` (refer to the following vignette for the -basics of the function: ["Work with archive objects and data -revisions"](https://cmu-delphi.github.io/epiprocess/articles/archive.html)). The -key difference from `epi_slide()` is that it performs version-aware -computations. That is, the function only uses data that would have been -available as of time t for that reference time. - -In this vignette, we use `epi_slide()` and `epix_slide()` for backtesting our -`arx_forecaster` on historical COVID-19 case data from the US and from Canada. -More precisely, we first demonstrate using `epi_slide()` to slide ARX -forecasters over an `epi_df` object and compare the results obtained from using -different forecasting engines. 
We then compare these simple retrospective -forecasts to more proper "pseudoprospective" forecasts generated using snapshots -of the data that was available in real time, using `epix_slide()`. - -## Comparing different forecasting engines +In this vignette, we use `epix_slide()` for backtesting our `arx_forecaster` on +historical COVID-19 case data from the US and from Canada. We first examine the +results from a version-unaware forecaster, comparing two different fitting +engines and then we contrast this with version-aware forecasting. The former +will proceed by constructing an `epi_archive` that erases its version +information and then using `epix_slide()` to forecast the future. The latter will +keep the versioned data and proceed similarly by using `epix_slide()` to +forecast the future. + +## Version-unaware forecasting ### Example using CLI and case data from US states @@ -57,17 +51,20 @@ us_archive <- epix_merge( ) ``` -After obtaining the latest snapshot of the data, we produce forecasts on that -data using the default engine of simple linear regression and compare to a -random forest. - -Note that all of the warnings about the forecast date being less than the most -recent update date of the data have been suppressed to avoid cluttering the -output. +We then get the latest snapshot of the data from the archive by using +`epix_as_of()`. We then create fake version information by setting `version = +time_value`. This creates an archive that pretends to have the latest data +available (since at version time `x` it has all the data up to time_value `x`, +which in reality is unrealistic because the time values of the data received at +version time `x` often lag by a few days, not to mention the later corrections +that are amended to the data).
```{r make-arx-kweek, warning = FALSE} # Latest snapshot of data, and forecast dates -us_latest <- epix_as_of(us_archive, max_version = max(us_archive$versions_end)) +us_latest <- us_archive %>% + epix_as_of(version = max(.$versions_end)) %>% + mutate(version = time_value) %>% + as_epi_archive() fc_time_values <- seq( from = as.Date("2020-08-01"), to = as.Date("2021-11-01"), @@ -75,34 +72,39 @@ fc_time_values <- seq( ) aheads <- c(7, 14, 21, 28) -k_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) { - epi_slide(epi_df, ~ arx_forecaster( - .x, outcome, predictors, engine, - args_list = arx_args_list(ahead = ahead) - )$predictions %>% - select(-geo_value), - before = 120L - 1L, - ref_time_values = fc_time_values, - new_col_name = "fc" - ) %>% - select(geo_value, time_value, starts_with("fc")) %>% - mutate(engine_type = engine$engine) +forecast_k_week_ahead <- function(epi_archive, outcome, predictors, ahead = 7, engine) { + epi_archive %>% + epix_slide( + .f = function(x, gk, rtv) { + arx_forecaster( + x, outcome, predictors, engine, + args_list = arx_args_list(ahead = ahead) + )$predictions %>% + mutate(engine_type = engine$engine) %>% + pivot_quantiles_wider(.pred_distn) + }, + .before = 120, + .versions = fc_time_values + ) } # Generate the forecasts and bind them together -fc <- bind_rows( - map(aheads, ~ k_week_ahead( - us_latest, "case_rate", c("case_rate", "percent_cli"), .x, +forecasts_version_unaware <- bind_rows( + map(aheads, ~ forecast_k_week_ahead( + us_latest, + outcome = "case_rate", + predictors = c("case_rate", "percent_cli"), + ahead = .x, engine = linear_reg() - )) %>% - list_rbind(), - map(aheads, ~ k_week_ahead( - us_latest, "case_rate", c("case_rate", "percent_cli"), .x, + )), + map(aheads, ~ forecast_k_week_ahead( + us_latest, + outcome = "case_rate", + predictors = c("case_rate", "percent_cli"), + ahead = .x, engine = rand_forest(mode = "regression") - )) %>% - list_rbind() -) %>% - 
pivot_quantiles_wider(contains("_distn")) + )) +) ``` Here, `arx_forecaster()` does all the heavy lifting. It creates leads of the @@ -111,25 +113,28 @@ target (respecting time stamps and locations) along with lags of the features specified engine, creates predictions, and non-parametric confidence bands. To see how the predictions compare, we plot them on top of the latest case -rates. Note that even though we've fitted the model on all states, -we'll just display the -results for two states, California (CA) and Florida (FL), to get a sense of the -model performance while keeping the graphic simple. +rates. Note that even though we've fitted the model on all states, we'll just +display the results for two states, California (CA) and Florida (FL), to get a +sense of the model performance while keeping the graphic simple. ```{r plot-arx, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} #| code-fold: true -fc_cafl <- fc %>% filter(geo_value %in% c("ca", "fl")) -latest_cafl <- us_latest %>% filter(geo_value %in% c("ca", "fl")) - -ggplot(fc_cafl, aes(fc_target_date, group = time_value, fill = engine_type)) + +forecasts_filtered <- forecasts_version_unaware %>% + tibble() %>% + filter(geo_value %in% c("ca", "fl")) +latest_data_filtered <- us_latest$DT %>% + tibble() %>% + filter(geo_value %in% c("ca", "fl")) + +ggplot(forecasts_filtered, aes(x = target_date, group = forecast_date, fill = engine_type)) + geom_line( - data = latest_cafl, aes(x = time_value, y = case_rate), + data = latest_data_filtered, aes(x = time_value, y = case_rate), inherit.aes = FALSE, color = "gray50" ) + geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) + - geom_line(aes(y = fc_.pred)) + - geom_point(aes(y = fc_.pred), size = 0.5) + - geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + + geom_line(aes(y = .pred)) + + geom_point(aes(y = .pred), size = 0.5) + + geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) + facet_grid(engine_type 
~ geo_value, scales = "free") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + scale_fill_brewer(palette = "Set1") + @@ -139,17 +144,18 @@ ggplot(fc_cafl, aes(fc_target_date, group = time_value, fill = engine_type)) + ``` For the two states of interest, simple linear regression clearly performs better -than random forest in terms of accuracy of the predictions and does not -result in such in overconfident predictions (overly narrow confidence bands). -Though, in general, neither approach produces amazingly accurate forecasts. -This could be because -the behaviour is rather different across states and the effects of other notable -factors such as age and public health measures may be important to account for -in such forecasting. Including such factors as well as making enhancements such -as correcting for outliers are some improvements one could make to this simple -model.[^1] - -[^1]: Note that, despite the above caveats, simple models like this tend to out-perform many far more complicated models in the online Covid forecasting due to those models high variance predictions. +than random forest in terms of accuracy of the predictions and does not result +in such overconfident predictions (overly narrow confidence bands). Though, +in general, neither approach produces amazingly accurate forecasts. This could +be because the behaviour is rather different across states and the effects of +other notable factors such as age and public health measures may be important to +account for in such forecasting. Including such factors as well as making +enhancements such as correcting for outliers are some improvements one could +make to this simple model.[^1] + +[^1]: Note that, despite the above caveats, simple models like this tend to +out-perform many far more complicated models in online Covid forecasting due +to those models' high-variance predictions.
### Example using case data from Canada @@ -180,21 +186,19 @@ can <- can %>% mutate(cr_7dav = RcppRoll::roll_meanr(case_rate, n = 7L)) %>% as_epi_archive(compactify = TRUE) -can_latest <- epix_as_of(can, max_version = max(can$DT$version)) +can_latest <- epix_as_of(can, max_version = max(can$DT$version)) %>% + mutate(version = time_value) %>% + as_epi_archive() # Generate the forecasts, and bind them together can_fc <- bind_rows( - map(aheads, ~ k_week_ahead( - can_latest, "cr_7dav", "cr_7dav", .x, linear_reg() - )) %>% - list_rbind(), - map(aheads, ~ k_week_ahead( - can_latest, "cr_7dav", "cr_7dav", .x, - boost_tree(mode = "regression", trees = 20) - )) %>% - list_rbind() -) %>% - pivot_quantiles_wider(contains("_distn")) + map(aheads, + ~ forecast_k_week_ahead(can_latest, "cr_7dav", "cr_7dav", .x, linear_reg()) + ), + map(aheads, + ~ forecast_k_week_ahead(can_latest, "cr_7dav", "cr_7dav", .x, boost_tree(mode = "regression", trees = 20)) + ) +) ``` The first figure shows the results for all of the provinces using linear regression. 
@@ -203,19 +207,19 @@ The first figure shows the results for all of the provinces using linear regress #| code-fold: true ggplot( can_fc %>% filter(engine_type == "lm"), - aes(x = fc_target_date, group = time_value) + aes(x = target_date, group = forecast_date) ) + coord_cartesian(xlim = lubridate::ymd(c("2020-12-01", NA))) + geom_line( - data = can_latest, aes(x = time_value, y = cr_7dav), + data = can_latest$DT %>% tibble(), aes(x = time_value, y = cr_7dav), inherit.aes = FALSE, color = "gray50" ) + geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value), alpha = 0.4 ) + - geom_line(aes(y = fc_.pred)) + - geom_point(aes(y = fc_.pred), size = 0.5) + - geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + + geom_line(aes(y = .pred)) + + geom_point(aes(y = .pred), size = 0.5) + + geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) + facet_wrap(~geo_value, scales = "free_y", ncol = 3) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + scale_y_continuous(expand = expansion(c(0, 0.05))) + @@ -232,19 +236,19 @@ Compare those forecasts with a related set using Gradient Boosting. 
#| code-fold: true ggplot( can_fc %>% filter(engine_type == "xgboost"), - aes(x = fc_target_date, group = time_value) + aes(x = target_date, group = forecast_date) ) + coord_cartesian(xlim = lubridate::ymd(c("2020-12-01", NA))) + geom_line( - data = can_latest, aes(x = time_value, y = cr_7dav), + data = can_latest$DT %>% tibble(), aes(x = time_value, y = cr_7dav), inherit.aes = FALSE, color = "gray50" ) + geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value), alpha = 0.4 ) + - geom_line(aes(y = fc_.pred)) + - geom_point(aes(y = fc_.pred), size = 0.5) + - geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + + geom_line(aes(y = .pred)) + + geom_point(aes(y = .pred), size = 0.5) + + geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) + facet_wrap(~geo_value, scales = "free_y", ncol = 3) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + scale_y_continuous(expand = expansion(c(0, 0.05))) + @@ -262,72 +266,61 @@ sliding with different engines in `arx_forecaster`, we may devote another vignette to work on improving the predictive modelling using the suite of tools available in epipredict. -## Pseudoprospective vs. unfaithful retrospective forecasting +## Version-aware forecasting ### Example using case data from US states We will now run pseudoprospective forecasts based on properly-versioned data (that would have been available in real-time) to forecast future COVID-19 case -rates from current and past COVID-19 case rates for all states. That is, we can -make forecasts on the archive, `us_archive`, and compare those to forecasts on -(time windows of) the latest data, `us_latest`, using the same general set-up as -above. For pseudoprospective forecasting, note that `us_archive` is fed into -`epix_slide()`, while for simpler (unfaithful) retrospective forecasting, -`us_latest` is fed into `epi_slide()`. #%% update to include percent_cli after -that issue is fixed? 
+rates from current and past COVID-19 case rates for all states. All we have to +do is use the historical archive of the data with version information, +`us_archive`, instead of `us_latest` like we did above, in the argument to our +forecaster wrapper `forecast_k_week_ahead()`. Below we do that computation, tag +it, and combine it with the forecasts from one of the engines made above. ```{r make-ar-kweek-asof} -k_week_versioning <- function(ahead, version = c("faithful", "unfaithful")) { - version <- match.arg(version) - if (version == "faithful") { - epix_slide( - us_archive, - ~ arx_forecaster( - .x, "case_rate", c("case_rate", "percent_cli"), - args_list = arx_args_list(ahead = ahead) - )$predictions, - before = 120 - 1, - ref_time_values = fc_time_values, - new_col_name = "fc" - ) %>% - mutate(version = "version faithful") %>% - rename(geo_value = "fc_geo_value") - } else { - k_week_ahead( - us_latest, "case_rate", c("case_rate", "percent_cli"), - ahead, linear_reg() - ) %>% mutate(version = "not version faithful") - } -} - # Generate the forecasts, and bind them together -fc <- bind_rows( - map(aheads, ~ k_week_versioning(.x, "faithful")) %>% list_rbind(), - map(aheads, ~ k_week_versioning(.x, "unfaithful")) %>% list_rbind() -) %>% pivot_quantiles_wider(fc_.pred_distn) +forecasts_version_aware <- bind_rows( + map(aheads, ~ forecast_k_week_ahead( + us_archive, + outcome = "case_rate", + predictors = c("case_rate", "percent_cli"), + ahead = .x, + engine = linear_reg() + )) %>% + bind_rows() %>% + mutate(version = "version faithful"), + forecasts_version_unaware %>% filter(engine_type == "lm") %>% mutate(version = "version unfaithful") +) ``` -Now we can plot the results on top of the latest case rates. As before, we will only display and focus on the results for FL and CA for simplicity. +Now we can plot the results on top of the latest case rates. As before, we will +only display and focus on the results for FL and CA for simplicity. 
```{r plot-ar-asof, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} #| code-fold: true -fc_cafl <- fc %>% filter(geo_value %in% c("ca", "fl")) -latest_cafl <- us_latest %>% filter(geo_value %in% c("ca", "fl")) - -ggplot(fc_cafl, aes(x = fc_target_date, group = time_value)) + +forecasts_filtered <- forecasts_version_aware %>% + tibble() %>% + filter(geo_value %in% c("ca", "fl")) +latest_data_filtered <- us_latest$DT %>% + tibble() %>% + select(-version) %>% + filter(geo_value %in% c("ca", "fl")) + +ggplot(forecasts_filtered, aes(x = target_date, group = forecast_date, fill = version)) + geom_line( - data = latest_cafl, aes(x = time_value, y = case_rate), + data = latest_data_filtered, aes(x = time_value, y = case_rate), inherit.aes = FALSE, color = "gray50" ) + - geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = version), alpha = 0.4) + - geom_line(aes(y = fc_.pred)) + - geom_point(aes(y = fc_.pred), size = 0.5) + - geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + + geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) + + geom_line(aes(y = .pred)) + + geom_point(aes(y = .pred), size = 0.5) + + geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) + facet_grid(version ~ geo_value, scales = "free") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + + scale_fill_brewer(palette = "Set1") + scale_y_continuous(expand = expansion(c(0, 0.05))) + labs(x = "Date", y = "Reported COVID-19 case rates") + - scale_fill_brewer(palette = "Set1") + theme(legend.position = "none") ``` From 1a14295d22d57c961144c17855077d7169e5d880 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 2 Oct 2024 11:06:08 -0700 Subject: [PATCH 8/8] fix: slide plot --- .../execute-results/html.json | 4 +- .../figure-html/plot-ar-asof-1.svg | 278 +++++------------- sliding-forecasters.qmd | 27 +- 3 files changed, 91 insertions(+), 218 deletions(-) diff --git a/_freeze/sliding-forecasters/execute-results/html.json 
b/_freeze/sliding-forecasters/execute-results/html.json index 74fa0c2..9169e19 100644 --- a/_freeze/sliding-forecasters/execute-results/html.json +++ b/_freeze/sliding-forecasters/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "a74b5381d41b5132ec6860cad9e9e252", + "hash": "6068951c462b53dbb580b3d90e9fa2f1", "result": { - "markdown": "# Sliding version-unaware and version-aware ARX forecasters across dates\n\n\n::: {.cell}\n\n:::\n\n\nA key function from the epiprocess package is `epix_slide()` (refer to the\nfollowing vignette for the basics of the function: [\"Work with archive objects\nand data\nrevisions\"](https://cmu-delphi.github.io/epiprocess/articles/archive.html))\nwhich allows performing version-aware computations. That is, the function only\nuses data that would have been available as of time t for that reference time.\n\nIn this vignette, we use `epix_slide()` for backtesting our `arx_forecaster` on\nhistorical COVID-19 case data from the US and from Canada. We first examine the\nresults from a version-unaware forecaster, comparing two different fitting\nengines and then we contrast this with version-aware forecasting. The former\nwill proceed by constructing an `epi_archive` that erases its version\ninformation and then use `epix_slide()` to forecast the future. The latter will\nkeep the versioned data and proceed similarly by using `epix_slide()` to\nforecast the future.\n\n## Version-unaware forecasting\n\n### Example using CLI and case data from US states\n\nFirst, we download the version history (i.e. archive) of the percentage of\ndoctor’s visits with CLI (COVID-like illness) computed from medical insurance\nclaims and the number of new confirmed COVID-19 cases per 100,000 population\n(daily) for all 50 states from the COVIDcast API. 
We process as before, with the\nmodification that we use `sync = \"locf\"` in `epix_merge()` so that the last\nversion of each observation can be carried forward to extrapolate unavailable\nversions for the less up-to-date input archive.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/grab-epi-data_89a9d4079f8ffc6080f83369668b2316'}\n\n```{.r .cell-code}\nus_raw_history_dfs <- readRDS(url(\n \"https://github.com/cmu-delphi/epipredict/raw/dev/vignettes/articles/all_states_covidcast_signals.rds\"\n))\n\nus_cli_archive <- us_raw_history_dfs[[1]] %>%\n select(geo_value, time_value, version = issue, percent_cli = value) %>%\n as_epi_archive(compactify = TRUE)\nus_cases_archive <- us_raw_history_dfs[[2]] %>%\n select(geo_value, time_value, version = issue, case_rate = value) %>%\n as_epi_archive(compactify = TRUE)\n\nus_archive <- epix_merge(\n us_cli_archive, us_cases_archive,\n sync = \"locf\", compactify = TRUE\n)\n```\n:::\n\n\nWe then get latest snapshot of the data from the archive by using\n`epix_as_of()`. We then create fake version information by setting `version =\ntime_value`. 
This creates an archive that pretends to have the latest data\navailable (since at version time `x` it has all the data up to time_value `x`,\nwhich in reality is unrealistic because the time values of the data received at\nversion time `x` often lags by a few days, not to mention the later corrections\nthat are amended to the data).\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-arx-kweek_6ff04c287f0d7a0f9d4503649e56bd3a'}\n\n```{.r .cell-code}\n# Latest snapshot of data, and forecast dates\nus_latest <- us_archive %>%\n epix_as_of(version = max(.$versions_end)) %>%\n mutate(version = time_value) %>%\n as_epi_archive()\nfc_time_values <- seq(\n from = as.Date(\"2020-08-01\"),\n to = as.Date(\"2021-11-01\"),\n by = \"1 month\"\n)\naheads <- c(7, 14, 21, 28)\n\nforecast_k_week_ahead <- function(epi_archive, outcome, predictors, ahead = 7, engine) {\n epi_archive %>%\n epix_slide(\n .f = function(x, gk, rtv) {\n arx_forecaster(\n x, outcome, predictors, engine,\n args_list = arx_args_list(ahead = ahead)\n )$predictions %>%\n mutate(engine_type = engine$engine) %>%\n pivot_quantiles_wider(.pred_distn)\n },\n .before = 120,\n .versions = fc_time_values\n )\n}\n\n# Generate the forecasts and bind them together\nforecasts_version_unaware <- bind_rows(\n map(aheads, ~ forecast_k_week_ahead(\n us_latest,\n outcome = \"case_rate\",\n predictors = c(\"case_rate\", \"percent_cli\"),\n ahead = .x,\n engine = linear_reg()\n )),\n map(aheads, ~ forecast_k_week_ahead(\n us_latest,\n outcome = \"case_rate\",\n predictors = c(\"case_rate\", \"percent_cli\"),\n ahead = .x,\n engine = rand_forest(mode = \"regression\")\n ))\n)\n```\n:::\n\n\nHere, `arx_forecaster()` does all the heavy lifting. 
It creates leads of the\ntarget (respecting time stamps and locations) along with lags of the features\n(here, the response and doctors visits), estimates a forecasting model using the\nspecified engine, creates predictions, and non-parametric confidence bands.\n\nTo see how the predictions compare, we plot them on top of the latest case\nrates. Note that even though we've fitted the model on all states, we'll just\ndisplay the results for two states, California (CA) and Florida (FL), to get a\nsense of the model performance while keeping the graphic simple.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-arx_2918e9947b5ecabca1115f6cc2d8eb62'}\n\n```{.r .cell-code code-fold=\"true\"}\nforecasts_filtered <- forecasts_version_unaware %>%\n tibble() %>%\n filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_data_filtered <- us_latest$DT %>%\n tibble() %>%\n filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(forecasts_filtered, aes(x = target_date, group = forecast_date, fill = engine_type)) +\n geom_line(\n data = latest_data_filtered, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) +\n geom_line(aes(y = .pred)) +\n geom_point(aes(y = .pred), size = 0.5) +\n geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) +\n facet_grid(engine_type ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_fill_brewer(palette = \"Set1\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-arx-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nFor the two states of interest, simple linear regression clearly performs better\nthan random forest in terms of accuracy of the predictions and does not result\nin 
such in overconfident predictions (overly narrow confidence bands). Though,\nin general, neither approach produces amazingly accurate forecasts. This could\nbe because the behaviour is rather different across states and the effects of\nother notable factors such as age and public health measures may be important to\naccount for in such forecasting. Including such factors as well as making\nenhancements such as correcting for outliers are some improvements one could\nmake to this simple model.[^1]\n\n[^1]: Note that, despite the above caveats, simple models like this tend to\nout-perform many far more complicated models in the online Covid forecasting due\nto those models high variance predictions.\n\n### Example using case data from Canada\n\nBy leveraging the flexibility of `epiprocess`, we can apply the same techniques\nto data from other sources. Since some collaborators are in British Columbia,\nCanada, we'll do essentially the same thing for Canada as we did above.\n\nThe [COVID-19 Canada Open Data Working Group](https://opencovid.ca/) collects\ndaily time series data on COVID-19 cases, deaths, recoveries, testing and\nvaccinations at the health region and province levels. Data are collected from\npublicly available sources such as government datasets and news releases.\nUnfortunately, there is no simple versioned source, so we have created our own\nfrom the Github commit history.\n\nFirst, we load versioned case rates at the provincial level. After converting\nthese to 7-day averages (due to highly variable provincial reporting\nmismatches), we then convert the data to an `epi_archive` object, and extract\nthe latest version from it. 
Finally, we run the same forcasting exercise as for\nthe American data, but here we compare the forecasts produced from using simple\nlinear regression with those from using boosted regression trees.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/get-can-fc_2457088f4bfc3bada5f7c38814504be7'}\n\n```{.r .cell-code}\n# source(\"drafts/canada-case-rates.R)\ncan <- epidatasets::can_prov_cases\ncan <- can %>%\n group_by(version, geo_value) %>%\n arrange(time_value) %>%\n mutate(cr_7dav = RcppRoll::roll_meanr(case_rate, n = 7L)) %>%\n as_epi_archive(compactify = TRUE)\n\ncan_latest <- epix_as_of(can, max_version = max(can$DT$version)) %>%\n mutate(version = time_value) %>%\n as_epi_archive()\n\n# Generate the forecasts, and bind them together\ncan_fc <- bind_rows(\n map(\n aheads,\n ~ forecast_k_week_ahead(can_latest, \"cr_7dav\", \"cr_7dav\", .x, linear_reg())\n ),\n map(\n aheads,\n ~ forecast_k_week_ahead(can_latest, \"cr_7dav\", \"cr_7dav\", .x, boost_tree(mode = \"regression\", trees = 20))\n )\n)\n```\n:::\n\n\nThe first figure shows the results for all of the provinces using linear regression.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-lr_f7e7878c3f1a72f4cb9216d68aa63292'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"lm\"),\n aes(x = target_date, group = forecast_date)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest$DT %>% tibble(), aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = .pred)) +\n geom_point(aes(y = .pred), size = 0.5) +\n geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand 
= expansion(c(0, 0.05))) +\n labs(\n title = \"Using simple linear regression\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-lr-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nCompare those forecasts with a related set using Gradient Boosting.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-boost_58502edbabde6914bca10e407c6f445f'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"xgboost\"),\n aes(x = target_date, group = forecast_date)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest$DT %>% tibble(), aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = .pred)) +\n geom_point(aes(y = .pred), size = 0.5) +\n geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(\n title = \"Using boosted regression trees\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-boost-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nBoth approaches tend to produce quite volatile forecasts (point predictions)\nand/or are overly confident (very narrow bands), particularly when boosted\nregression trees are used. 
But as this is meant to be a simple demonstration of\nsliding with different engines in `arx_forecaster`, we may devote another\nvignette to work on improving the predictive modelling using the suite of tools\navailable in epipredict.\n\n## Version-aware forecasting\n\n### Example using case data from US states\n\nWe will now run pseudoprospective forecasts based on properly-versioned data\n(that would have been available in real-time) to forecast future COVID-19 case\nrates from current and past COVID-19 case rates for all states. All we have to\ndo is use the historical archive of the data with version information,\n`us_archive`, instead of `us_latest` like we did above, in the argument to our\nforecaster wrapper `forecast_k_week_ahead()`. Below we do that computation, tag\nit, and combine it with the forecasts from one of the engines made above.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-ar-kweek-asof_4b8247ffc26ffb4f84a5646852522f3b'}\n\n```{.r .cell-code}\n# Generate the forecasts, and bind them together\nforecasts_version_aware <- bind_rows(\n map(aheads, ~ forecast_k_week_ahead(\n us_archive,\n outcome = \"case_rate\",\n predictors = c(\"case_rate\", \"percent_cli\"),\n ahead = .x,\n engine = linear_reg()\n )) %>%\n bind_rows() %>%\n mutate(version = \"version faithful\"),\n forecasts_version_unaware %>% filter(engine_type == \"lm\") %>% mutate(version = \"version unfaithful\")\n)\n```\n:::\n\n\nNow we can plot the results on top of the latest case rates. 
As before, we will\nonly display and focus on the results for FL and CA for simplicity.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-ar-asof_884abfb3ac2d5a8dfe9bc404ff34c5d4'}\n\n```{.r .cell-code code-fold=\"true\"}\nforecasts_filtered <- forecasts_version_aware %>%\n tibble() %>%\n filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_data_filtered <- us_latest$DT %>%\n tibble() %>%\n select(-version) %>%\n filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(forecasts_filtered, aes(x = target_date, group = forecast_date, fill = version)) +\n geom_line(\n data = latest_data_filtered, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) +\n geom_line(aes(y = .pred)) +\n geom_point(aes(y = .pred), size = 0.5) +\n geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) +\n facet_grid(version ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_fill_brewer(palette = \"Set1\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-ar-asof-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAgain, we observe that the results are not great for these two states, but\nthat's likely due to the simplicity of the model (ex. the omission of key\nfactors such as age and public health measures) and the quality of the data (ex.\nwe have not personally corrected for anomalies in the data).\n\nWe shall leave it to the reader to try the above version aware and unaware\nforecasting exercise on the Canadian case rate data. 
The above code for the\nAmerican state data should be readily adaptable for this purpose.\n", + "markdown": "# Sliding version-unaware and version-aware ARX forecasters across dates\n\n\n::: {.cell}\n\n:::\n\n\nA key function from the epiprocess package is `epix_slide()` (refer to the\nfollowing vignette for the basics of the function: [\"Work with archive objects\nand data\nrevisions\"](https://cmu-delphi.github.io/epiprocess/articles/archive.html))\nwhich allows performing version-aware computations. That is, the function only\nuses data that would have been available as of time t for that reference time.\n\nIn this vignette, we use `epix_slide()` for backtesting our `arx_forecaster` on\nhistorical COVID-19 case data from the US and from Canada. We first examine the\nresults from a version-unaware forecaster, comparing two different fitting\nengines and then we contrast this with version-aware forecasting. The former\nwill proceed by constructing an `epi_archive` that erases its version\ninformation and then use `epix_slide()` to forecast the future. The latter will\nkeep the versioned data and proceed similarly by using `epix_slide()` to\nforecast the future.\n\n## Version-unaware forecasting\n\n### Example using CLI and case data from US states\n\nFirst, we download the version history (i.e. archive) of the percentage of\ndoctor’s visits with CLI (COVID-like illness) computed from medical insurance\nclaims and the number of new confirmed COVID-19 cases per 100,000 population\n(daily) for all 50 states from the COVIDcast API. 
We process as before, with the\nmodification that we use `sync = \"locf\"` in `epix_merge()` so that the last\nversion of each observation can be carried forward to extrapolate unavailable\nversions for the less up-to-date input archive.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/grab-epi-data_89a9d4079f8ffc6080f83369668b2316'}\n\n```{.r .cell-code}\nus_raw_history_dfs <- readRDS(url(\n \"https://github.com/cmu-delphi/epipredict/raw/dev/vignettes/articles/all_states_covidcast_signals.rds\"\n))\n\nus_cli_archive <- us_raw_history_dfs[[1]] %>%\n select(geo_value, time_value, version = issue, percent_cli = value) %>%\n as_epi_archive(compactify = TRUE)\nus_cases_archive <- us_raw_history_dfs[[2]] %>%\n select(geo_value, time_value, version = issue, case_rate = value) %>%\n as_epi_archive(compactify = TRUE)\n\nus_archive <- epix_merge(\n us_cli_archive, us_cases_archive,\n sync = \"locf\", compactify = TRUE\n)\n```\n:::\n\n\nWe then get latest snapshot of the data from the archive by using\n`epix_as_of()`. We then create fake version information by setting `version =\ntime_value`. 
This creates an archive that pretends to have the latest data\navailable (since at version time `x` it has all the data up to time_value `x`,\nwhich in reality is unrealistic because the time values of the data received at\nversion time `x` often lags by a few days, not to mention the later corrections\nthat are amended to the data).\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-arx-kweek_c224b23a96f435b4b2bdec18ce48389c'}\n\n```{.r .cell-code}\n# Get latest snapshot of data and pretend it's an archive\nus_latest <- us_archive %>%\n epix_as_of(version = max(.$versions_end)) %>%\n mutate(version = time_value) %>%\n as_epi_archive()\nfc_time_values <- seq(\n from = as.Date(\"2020-08-01\"),\n to = as.Date(\"2021-11-01\"),\n by = \"1 month\"\n)\naheads <- c(7, 14, 21, 28)\n\nforecast_k_week_ahead <- function(epi_archive, outcome, predictors, ahead = 7, engine) {\n epi_archive %>%\n epix_slide(\n .f = function(x, gk, rtv) {\n arx_forecaster(\n x, outcome, predictors, engine,\n args_list = arx_args_list(ahead = ahead)\n )$predictions %>%\n mutate(engine_type = engine$engine) %>%\n pivot_quantiles_wider(.pred_distn)\n },\n .before = 120,\n .versions = fc_time_values\n )\n}\n\n# Generate the forecasts and bind them together\nforecasts_version_unaware <- bind_rows(\n map(aheads, ~ forecast_k_week_ahead(\n us_latest,\n outcome = \"case_rate\",\n predictors = c(\"case_rate\", \"percent_cli\"),\n ahead = .x,\n engine = linear_reg()\n )),\n map(aheads, ~ forecast_k_week_ahead(\n us_latest,\n outcome = \"case_rate\",\n predictors = c(\"case_rate\", \"percent_cli\"),\n ahead = .x,\n engine = rand_forest(mode = \"regression\")\n ))\n)\n```\n:::\n\n\nHere, `arx_forecaster()` does all the heavy lifting. 
It creates leads of the\ntarget (respecting time stamps and locations) along with lags of the features\n(here, the response and doctors visits), estimates a forecasting model using the\nspecified engine, creates predictions, and non-parametric confidence bands.\n\nTo see how the predictions compare, we plot them on top of the latest case\nrates. Note that even though we've fitted the model on all states, we'll just\ndisplay the results for two states, California (CA) and Florida (FL), to get a\nsense of the model performance while keeping the graphic simple.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-arx_2918e9947b5ecabca1115f6cc2d8eb62'}\n\n```{.r .cell-code code-fold=\"true\"}\nforecasts_filtered <- forecasts_version_unaware %>%\n tibble() %>%\n filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_data_filtered <- us_latest$DT %>%\n tibble() %>%\n filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(forecasts_filtered, aes(x = target_date, group = forecast_date, fill = engine_type)) +\n geom_line(\n data = latest_data_filtered, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) +\n geom_line(aes(y = .pred)) +\n geom_point(aes(y = .pred), size = 0.5) +\n geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) +\n facet_grid(engine_type ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_fill_brewer(palette = \"Set1\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-arx-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nFor the two states of interest, simple linear regression clearly performs better\nthan random forest in terms of accuracy of the predictions and does not result\nin 
such in overconfident predictions (overly narrow confidence bands). Though,\nin general, neither approach produces amazingly accurate forecasts. This could\nbe because the behaviour is rather different across states and the effects of\nother notable factors such as age and public health measures may be important to\naccount for in such forecasting. Including such factors as well as making\nenhancements such as correcting for outliers are some improvements one could\nmake to this simple model.[^1]\n\n[^1]: Note that, despite the above caveats, simple models like this tend to\nout-perform many far more complicated models in the online Covid forecasting due\nto those models high variance predictions.\n\n### Example using case data from Canada\n\nBy leveraging the flexibility of `epiprocess`, we can apply the same techniques\nto data from other sources. Since some collaborators are in British Columbia,\nCanada, we'll do essentially the same thing for Canada as we did above.\n\nThe [COVID-19 Canada Open Data Working Group](https://opencovid.ca/) collects\ndaily time series data on COVID-19 cases, deaths, recoveries, testing and\nvaccinations at the health region and province levels. Data are collected from\npublicly available sources such as government datasets and news releases.\nUnfortunately, there is no simple versioned source, so we have created our own\nfrom the Github commit history.\n\nFirst, we load versioned case rates at the provincial level. After converting\nthese to 7-day averages (due to highly variable provincial reporting\nmismatches), we then convert the data to an `epi_archive` object, and extract\nthe latest version from it. 
Finally, we run the same forcasting exercise as for\nthe American data, but here we compare the forecasts produced from using simple\nlinear regression with those from using boosted regression trees.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/get-can-fc_2457088f4bfc3bada5f7c38814504be7'}\n\n```{.r .cell-code}\n# source(\"drafts/canada-case-rates.R)\ncan <- epidatasets::can_prov_cases\ncan <- can %>%\n group_by(version, geo_value) %>%\n arrange(time_value) %>%\n mutate(cr_7dav = RcppRoll::roll_meanr(case_rate, n = 7L)) %>%\n as_epi_archive(compactify = TRUE)\n\ncan_latest <- epix_as_of(can, max_version = max(can$DT$version)) %>%\n mutate(version = time_value) %>%\n as_epi_archive()\n\n# Generate the forecasts, and bind them together\ncan_fc <- bind_rows(\n map(\n aheads,\n ~ forecast_k_week_ahead(can_latest, \"cr_7dav\", \"cr_7dav\", .x, linear_reg())\n ),\n map(\n aheads,\n ~ forecast_k_week_ahead(can_latest, \"cr_7dav\", \"cr_7dav\", .x, boost_tree(mode = \"regression\", trees = 20))\n )\n)\n```\n:::\n\n\nThe first figure shows the results for all of the provinces using linear regression.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-lr_f7e7878c3f1a72f4cb9216d68aa63292'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"lm\"),\n aes(x = target_date, group = forecast_date)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest$DT %>% tibble(), aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = .pred)) +\n geom_point(aes(y = .pred), size = 0.5) +\n geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand 
= expansion(c(0, 0.05))) +\n labs(\n title = \"Using simple linear regression\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-lr-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nCompare those forecasts with a related set using Gradient Boosting.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-can-fc-boost_58502edbabde6914bca10e407c6f445f'}\n\n```{.r .cell-code code-fold=\"true\"}\nggplot(\n can_fc %>% filter(engine_type == \"xgboost\"),\n aes(x = target_date, group = forecast_date)\n) +\n coord_cartesian(xlim = lubridate::ymd(c(\"2020-12-01\", NA))) +\n geom_line(\n data = can_latest$DT %>% tibble(), aes(x = time_value, y = cr_7dav),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value),\n alpha = 0.4\n ) +\n geom_line(aes(y = .pred)) +\n geom_point(aes(y = .pred), size = 0.5) +\n geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) +\n facet_wrap(~geo_value, scales = \"free_y\", ncol = 3) +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(\n title = \"Using boosted regression trees\", x = \"Date\",\n y = \"Reported COVID-19 case rates\"\n ) +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-can-fc-boost-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nBoth approaches tend to produce quite volatile forecasts (point predictions)\nand/or are overly confident (very narrow bands), particularly when boosted\nregression trees are used. 
But as this is meant to be a simple demonstration of\nsliding with different engines in `arx_forecaster`, we may devote another\nvignette to work on improving the predictive modelling using the suite of tools\navailable in epipredict.\n\n## Version-aware forecasting\n\n### Example using case data from US states\n\nWe will now run pseudoprospective forecasts based on properly-versioned data\n(that would have been available in real-time) to forecast future COVID-19 case\nrates from current and past COVID-19 case rates for all states. All we have to\ndo is use the historical archive of the data with version information,\n`us_archive`, instead of `us_latest` like we did above, in the argument to our\nforecaster wrapper `forecast_k_week_ahead()`. Below we do that computation, tag\nit, and combine it with the forecasts from one of the engines made above.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/make-faithful-forecast_c688db7d4caffffd6af45a48d0cfd7bc'}\n\n```{.r .cell-code}\nforecasts_version_aware <- map(aheads, ~ forecast_k_week_ahead(\n us_archive,\n outcome = \"case_rate\",\n predictors = c(\"case_rate\", \"percent_cli\"),\n ahead = .x,\n engine = linear_reg()\n)) %>%\n bind_rows() %>%\n mutate(version = \"version faithful\")\n```\n:::\n\n\nNow we can plot the results on top of the latest case rates. 
As before, we will\nonly display and focus on the results for FL and CA for simplicity.\n\n\n::: {.cell layout-align=\"center\" hash='sliding-forecasters_cache/html/plot-ar-asof_9cbc2463d0ba3b648ac129e2d8b9ccbd'}\n\n```{.r .cell-code code-fold=\"true\"}\nforecasts_filtered <- bind_rows(\n forecasts_version_aware,\n forecasts_version_unaware %>%\n filter(engine_type == \"lm\") %>%\n mutate(version = \"version unfaithful\")\n) %>%\n tibble() %>%\n filter(geo_value %in% c(\"ca\", \"fl\"))\nlatest_data_filtered <- us_latest$DT %>%\n tibble() %>%\n select(-version) %>%\n filter(geo_value %in% c(\"ca\", \"fl\"))\n\nggplot(forecasts_filtered, aes(x = target_date, group = forecast_date, fill = version)) +\n geom_line(\n data = latest_data_filtered, aes(x = time_value, y = case_rate),\n inherit.aes = FALSE, color = \"gray50\"\n ) +\n geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) +\n geom_line(aes(y = .pred)) +\n geom_point(aes(y = .pred), size = 0.5) +\n geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) +\n facet_grid(version ~ geo_value, scales = \"free\") +\n scale_x_date(minor_breaks = \"month\", date_labels = \"%b %y\") +\n scale_fill_brewer(palette = \"Set1\") +\n scale_y_continuous(expand = expansion(c(0, 0.05))) +\n labs(x = \"Date\", y = \"Reported COVID-19 case rates\") +\n theme(legend.position = \"none\")\n```\n\n::: {.cell-output-display}\n![](sliding-forecasters_files/figure-html/plot-ar-asof-1.svg){fig-align='center' width=90%}\n:::\n:::\n\n\nAgain, we observe that the results are not great for these two states, but\nthat's likely due to the simplicity of the model (e.g., the omission of key\nfactors such as age and public health measures) and the quality of the data (e.g.,\nwe have not personally corrected for anomalies in the data).\n\nWe shall leave it to the reader to try the above version aware and unaware\nforecasting exercise on the Canadian case rate data.
The above code for the\nAmerican state data should be readily adaptable for this purpose.\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg b/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg index c21dcfc..b0ec475 100644 --- a/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg +++ b/_freeze/sliding-forecasters/figure-html/plot-ar-asof-1.svg @@ -409,7 +409,7 @@ - + @@ -427,7 +427,7 @@ - + @@ -727,7 +727,7 @@ - + @@ -742,19 +742,19 @@ - + - + - + - + - + @@ -835,7 +835,7 @@ - + @@ -1172,54 +1172,54 @@ - + - + - - - - + + + + - + - + - + - + - + - + - - + + - + - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + @@ -1284,70 +1284,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1733,58 +1669,58 @@ - + - + - + - - - - + + + + - + - - + + - + - + - + - + - + - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + - + - + - + @@ -1861,70 +1797,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/sliding-forecasters.qmd b/sliding-forecasters.qmd index 65d8802..f486ce7 100644 --- a/sliding-forecasters.qmd +++ b/sliding-forecasters.qmd @@ -60,7 +60,7 @@ version time `x` often lags by a few days, not to mention the later corrections that are amended to the data). 
```{r make-arx-kweek, warning = FALSE} -# Latest snapshot of data, and forecast dates +# Get latest snapshot of data and pretend it's an archive us_latest <- us_archive %>% epix_as_of(version = max(.$versions_end)) %>% mutate(version = time_value) %>% @@ -79,13 +79,13 @@ forecast_k_week_ahead <- function(epi_archive, outcome, predictors, ahead = 7, e arx_forecaster( x, outcome, predictors, engine, args_list = arx_args_list(ahead = ahead) - )$predictions %>% - mutate(engine_type = engine$engine) %>% - pivot_quantiles_wider(.pred_distn) + )$predictions }, .before = 120, .versions = fc_time_values - ) + ) %>% + mutate(engine_type = engine$engine) %>% + pivot_quantiles_wider(.pred_distn) } # Generate the forecasts and bind them together @@ -278,10 +278,8 @@ do is use the historical archive of the data with version information, forecaster wrapper `forecast_k_week_ahead()`. Below we do that computation, tag it, and combine it with the forecasts from one of the engines made above. -```{r make-ar-kweek-asof} -# Generate the forecasts, and bind them together -forecasts_version_aware <- bind_rows( - map(aheads, ~ forecast_k_week_ahead( +```{r make-faithful-forecast} +forecasts_version_aware <- map(aheads, ~ forecast_k_week_ahead( us_archive, outcome = "case_rate", predictors = c("case_rate", "percent_cli"), @@ -289,9 +287,7 @@ forecasts_version_aware <- bind_rows( engine = linear_reg() )) %>% bind_rows() %>% - mutate(version = "version faithful"), - forecasts_version_unaware %>% filter(engine_type == "lm") %>% mutate(version = "version unfaithful") -) + mutate(version = "version faithful") ``` Now we can plot the results on top of the latest case rates. As before, we will @@ -299,7 +295,12 @@ only display and focus on the results for FL and CA for simplicity. 
```{r plot-ar-asof, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} #| code-fold: true -forecasts_filtered <- forecasts_version_aware %>% +forecasts_filtered <- bind_rows( + forecasts_version_aware, + forecasts_version_unaware %>% + filter(engine_type == "lm") %>% + mutate(version = "version unfaithful") + ) %>% tibble() %>% filter(geo_value %in% c("ca", "fl")) latest_data_filtered <- us_latest$DT %>%