* standalone-helpers updated

andrewallenbruce · Dec 17, 2024 · 787fa93 · 787fa93
1 parent 69def8d
commit 787fa93
Show file tree

Hide file tree

Showing 58 changed files with 1,430 additions and 87 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -27,7 +27,6 @@ Suggests:
     clock,
     dplyr,
     fipio,
-    fixtuRes,
     fontawesome,
     forcats,
     fs (>= 1.6.5),
@@ -54,7 +53,6 @@ Suggests:
     tidyr,
     timeplyr,
     usethis,
-    wakefield,
     zipcodeR
 Config/roxyglobals/filename: generated-globals.R
 Config/roxyglobals/unique: TRUE

diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,10 @@ export("%or%")
 export(add_counties)
 export(age_days)
 export(arrows)
+export(as_chr)
+export(as_date)
+export(as_int)
+export(as_num)
 export(bracks)
 export(change)
 export(change_lagged)
@@ -29,18 +33,25 @@ export(count_prop_multi)
 export(count_wide)
 export(create_vec)
 export(data_types)
+export(delist)
 export(delister)
 export(density)
 export(describe)
+export(describe2)
 export(describe_unique)
+export(desplit)
 export(display_long)
 export(duration_vec)
+export(empty)
 export(expand_date_range)
+export(false)
 export(fancy_ts)
 export(find_common_order)
 export(gchop)
+export(gelm)
 export(geomean)
 export(get_pin)
+export(getelem)
 export(gg_theme)
 export(gh_raw)
 export(glue_chr)
@@ -49,35 +60,68 @@ export(gluestick)
 export(gt_marks)
 export(histo)
 export(histogram)
+export(if_empty_null)
+export(iif_else)
 export(initialize_package)
 export(interpolate)
+export(invert_named)
 export(is_valid_npi)
 export(is_valid_npi2)
 export(list_pins)
 export(make_interval)
+export(max_vlen)
 export(mock_forager)
 export(mock_provider)
 export(mount_board)
+export(na)
+export(na_if)
 export(na_if_common)
 export(named_group_split)
 export(new_value)
+export(not_na)
+export(not_null)
+export(null)
 export(pad_number)
 export(parens)
 export(percentage_calculator)
 export(percentage_change)
 export(percentage_difference)
 export(print_ls)
 export(random_npi_generator)
+export(random_string)
 export(rate_of_return)
+export(remove_all_na)
 export(remove_quiet)
+export(remove_quotes)
 export(rename_seq)
+export(roundup)
 export(search_for)
+export(search_in)
+export(sf_at)
+export(sf_c)
+export(sf_chars)
+export(sf_conv)
+export(sf_detect)
+export(sf_extract)
+export(sf_ndetect)
+export(sf_nextract)
+export(sf_remove)
+export(sf_replace)
+export(sf_smush)
+export(sf_strsplit)
+export(sf_sub)
 export(single_line_string)
 export(sorted_bars)
 export(splitter)
+export(strsort)
 export(summary_stats)
+export(true)
 export(ttimestamp)
+export(uniq)
+export(uniq_narm)
+export(uniq_vlen)
 export(update_personal_packages)
+export(vlen)
 export(years_df)
 export(years_floor)
 export(years_vec)

diff --git a/R/describe.R b/R/describe.R
@@ -1,3 +1,75 @@
+#' Describe 2
+#'
+#' @param df `<data.frame>` desc
+#'
+#' @param ... `<dots>` tidyselect columns
+#'
+#' @returns `<tibble>` of summary statistics
+#'
+#' @examples
+#' describe2(mock_provider(2000:2020))
+#'
+#' describe2(mock_forager(200))
+#'
+#' @autoglobal
+#'
+#' @export
+describe2 <- function(df, ...) {
+
+  get_type <- \(x) {
+    cheapr::enframe_(
+      purrr::map_vec(x, function(x)
+          glue_chr("<{pillar::type_sum(x)}>")),
+      name = "column",
+      value = "type")
+  }
+
+  fiqr <- \(x) diff(collapse::.quantile(as.numeric(x), c(0.25, 0.75)))
+
+  if (nargs() > 1) df <- dplyr::select(df, ...)
+
+  dates <- dplyr::select(df, dplyr::where(\(x) inherits(x, "Date")))
+  df    <- dplyr::select(df, dplyr::where(\(x) !inherits(x, "Date")))
+
+  sums <- df |>
+    dplyr::mutate(
+      dplyr::across(dplyr::where(is.character), stringr::str_length),
+      dplyr::across(dplyr::where(\(x) is.factor(x) | is.logical(x)), as.numeric)) |>
+    tidyr::pivot_longer(dplyr::everything(), names_to = "column") |>
+    dplyr::mutate(n = 1 - cheapr::is_na(value)) |>
+    dplyr::reframe(
+      n = collapse::fsum(value, nthreads = 4L),
+      min = collapse::fmin(value),
+      mean = collapse::fmean(value, nthreads = 4L),
+      iqr = fiqr(value),
+      max = collapse::fmax(value),
+      med = collapse::fmedian(value),
+      sd = collapse::fsd(value),
+      mad = mad(value, na.rm = TRUE),
+      distribution = histo(value),
+      .by = column) |>
+    dplyr::left_join(get_type(df), by = dplyr::join_by(column))
+
+  topn <- \(x, limit = 10) {
+    dplyr::tibble(
+      column = names(x),
+      uniq = collapse::fnunique(collapse::na_rm(x)),
+      top = collapse::fcount(collapse::na_rm(x), name = "n") |>
+        dplyr::arrange(dplyr::desc(n)) |>
+        dplyr::slice(seq(1, limit)) |>
+        dplyr::pull(x) |>
+        stringr::str_flatten_comma())
+  }
+
+  tops <- purrr::map(df, topn) |>
+    purrr::list_rbind(names_to = "column") |>
+    dplyr::filter(uniq != nrow(df))
+
+  dplyr::left_join(sums, tops, by = dplyr::join_by(column)) |>
+    dplyr::arrange(dplyr::desc(type)) |>
+    dplyr::select(column, type, n, min, mean, med, max, iqr, sd, mad, distribution, uniq, top)
+}
+
 #' Describe a dataset
 #'
 #' @param df `<data.frame>` desc
@@ -140,9 +212,13 @@ histo <- function(x, width = 10) {
 #'
 #' @examples
 #'
-#' describe_unique(mock_forager(), ins_class, payer)
+#' describe_unique(mock_forager(), class, payer)
 #'
-#' # describe_unique(mock_forager(200), names(df)[2:3])
+#' describe_unique(
+#'   mock_forager(50),
+#'   names(mock_forager())[c(2:3, 6:9)]) |>
+#'   dplyr::filter(n > 2) |>
+#'   print(n = 30)
 #'
 #' @autoglobal
 #'
@@ -157,7 +233,9 @@ describe_unique <- function(df,
 
   df <- dplyr::select(df, ...)
 
-  .set_names <- if (is.null(.set_names)) names(df) else .set_names
+  .set_names <- if (null(.set_names)) names(df) else .set_names
+
+  df <- dplyr::mutate(df, dplyr::across(!dplyr::where(is.character), as.character))
 
   df <- columns_to_character(df) |>
     names() |>

diff --git a/R/generated-globals.R b/R/generated-globals.R
@@ -9,6 +9,8 @@ utils::globalVariables(c(
   # <rate_of_return>
   # <combine>
   ":=",
+  # <describe2>
+  "column",
   # <combine>
   "combined",
   # <rate_of_return>
@@ -23,35 +25,46 @@ utils::globalVariables(c(
   "date_of_service",
   # <mock_forager>
   "date_of_submission",
+  # <describe2>
+  "distribution",
   # <id_runs>
   "group",
   # <id_runs>
   "group_size",
   # <make_interval>
   "interval",
+  # <describe2>
   # <describe>
   "iqr",
   # <id_runs>
   "key",
   # <rate_of_return>
   "lg",
+  # <describe2>
   # <describe>
   "mad",
+  # <describe2>
   # <describe>
   "med",
+  # <describe2>
   # <describe>
   # <count_prop>
   # <count_prop_multi>
   # <count_wide>
   "n",
   # <describe>
   "nuniq",
+  # <describe2>
   # <describe>
   "sd",
+  # <describe2>
+  "top",
   # <describe>
   "top_n",
+  # <describe2>
   # <describe>
   "type",
+  # <describe2>
   # <describe>
   # <id_runs>
   "value",

diff --git a/R/mock.R b/R/mock.R
@@ -29,7 +29,7 @@ mock_provider <- \(years) {
 #'
 #' @param rows `<int>` number of rows to generate; default is 10
 #'
-#' @param nest `<lgl>` whether to nest the dates column; default is `TRUE`
+#' @param nest `<lgl>` whether to nest the dates column; default is `FALSE`
 #'
 #' @returns A [tibble][tibble::tibble-package]
 #'
@@ -41,23 +41,47 @@ mock_provider <- \(years) {
 #' @family mock
 #'
 #' @export
-mock_forager <- function(rows = 10, nest = TRUE){
+mock_forager <- function(rows = 10, nest = FALSE){
 
-  payer_names <- c("Medicare", "Medicaid", "Cigna", "Humana", "UHC", "Anthem", "BCBS", "Centene")
+  payers <- sample(
+    x = factor(
+      x = c("Medicare",
+            "Medicaid",
+            "Cigna",
+            "Humana",
+            "UHC",
+            "Anthem",
+            "BCBS",
+            "Centene",
+            "MAO")),
+    size = rows,
+    replace = TRUE)
+
+  classes <- sample(
+    x = ordered(
+      c("Primary",
+        "Secondary")),
+    size = rows,
+    replace = TRUE)
 
   x <- dplyr::tibble(
-    claim_id             = as.character(wakefield::id(n = rows)),
-    date_of_service      = wakefield::dob(n = rows, start = Sys.Date() - 730, random = TRUE, k = 12, by = "-1 months"),
-    payer                = fixtuRes::set_vector(rows, set = payer_names),
-    ins_class            = fixtuRes::set_vector(rows, set = c("Primary", "Secondary")),
-    balance              = as.double(wakefield::income(n = rows, digits = 2) / 300),
-    date_of_release      = date_of_service + round(abs(stats::rnorm(length(date_of_service), 11, 4))),
-    date_of_submission   = date_of_release + round(abs(stats::rnorm(length(date_of_release), 2, 2))),
-    date_of_acceptance   = date_of_submission + round(abs(stats::rnorm(length(date_of_submission), 3, 2))),
-    date_of_adjudication = date_of_acceptance + round(abs(stats::rnorm(length(date_of_acceptance), 30, 3))))
+    id                   = sprintf(paste0("%0", nchar(rows) + 3, "d"), seq_len(rows)),
+    payer                = payers,
+    class                = classes,
+    balance              = roundup(stats::rgamma(n = rows, 2) * 20000) / 300,
+    date_of_service      = sample(x = seq.Date(from = Sys.Date(), by = "-1 months", length.out = 12), size = rows, replace = TRUE),
+    date_of_release      = date_of_service + roundup(abs(stats::rnorm(rows, 11, 4))),
+    date_of_submission   = date_of_release + roundup(abs(stats::rnorm(rows, 2, 2))),
+    date_of_acceptance   = date_of_submission + roundup(abs(stats::rnorm(rows, 3, 2))),
+    date_of_adjudication = date_of_acceptance + roundup(abs(stats::rnorm(rows, 30, 3)))
+    )
 
   if (nest)
-    return(tidyr::nest(x, dates = tidyr::contains("date")))
-
+    return(
+      tidyr::nest(
+        x,
+        dates = tidyr::contains("date")
+        )
+      )
   x
 }