From a2f2e0bce851a0204a5686093160ebc71eb3332f Mon Sep 17 00:00:00 2001 From: Andrew Bruce Date: Tue, 30 Jul 2024 14:50:29 -0700 Subject: [PATCH] * `describe()` first draft --- DESCRIPTION | 2 ++ NAMESPACE | 1 + R/describe.R | 80 +++++++++++++++++++++++++++++++++++++++++++ R/fake_data.R | 13 ++++--- R/generated-globals.R | 8 +++++ man/describe.Rd | 28 +++++++++++++++ 6 files changed, 128 insertions(+), 4 deletions(-) create mode 100644 R/describe.R create mode 100644 man/describe.Rd diff --git a/DESCRIPTION b/DESCRIPTION index eeb48a0..ecc1a70 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,6 +18,8 @@ Depends: Imports: clock, collapse, + cheapr, + pillar, dplyr, forcats, fs, diff --git a/NAMESPACE b/NAMESPACE index bbc184e..765465c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,6 +21,7 @@ export(count_wide) export(create_vec) export(delister) export(density) +export(describe) export(df_2_chr) export(display_long) export(duration_vec) diff --git a/R/describe.R b/R/describe.R new file mode 100644 index 0000000..cd8b8f5 --- /dev/null +++ b/R/describe.R @@ -0,0 +1,80 @@ +#' Describe a dataset +#' +#' @param df `` desc +#' +#' @param ... `` tidyselect columns +#' +#' @returns `` of summary statistics +#' +#' @examples +#' describe(fuimus:::provider_data(2000:2020)) +#' +#' describe( +#' fuimus:::forager_data(200), +#' !dplyr::starts_with("date") +#' ) +#' +#' @autoglobal +#' +#' @export +describe <- function(df, ...) { + + if (nargs() > 1) df <- dplyr::select(df, ...) + + df_sums <- df |> + dplyr::mutate_if(is.character, stringr::str_length) |> + dplyr::mutate_if(is.factor, as.numeric) |> + dplyr::mutate_if(is.logical, as.numeric) |> + tidyr::pivot_longer( + cols = dplyr::everything(), + names_to = "variable" + ) |> + dplyr::mutate(n = 1 - is.na(value)) |> + dplyr::reframe( + n = as.integer(base::sum(n)), + amean = base::mean(value, na.rm = TRUE), + gmean = fuimus::geomean(value), + sd = stats::sd(value, na.rm = TRUE), + iqr = stats::IQR(value, na.rm = TRUE), + median = stats::median(value, na.rm = TRUE), + mad = stats::mad(value, na.rm = TRUE), + range = as.character(stringr::str_glue( + "[", + "{base::min(value, na.rm = TRUE)}", + " - ", + "{as.integer(base::max(value, na.rm = TRUE))}", + "]" + ) + ), + hist = cheapr:::inline_hist(value), + .by = variable + ) + + get_type <- \(x) dplyr::tibble( + variable = names(x), + type = stringr::str_c("<", pillar::type_sum(x), ">") |> + forcats::as_factor() + ) + + df_types <- purrr::map(df, get_type) |> + purrr::list_rbind(names_to = "variable") + + get_unique <- \(x, limit = 5) dplyr::tibble( + variable = names(x), + n_uniq = collapse::fnunique(collapse::na_rm(x)), + top_5 = collapse::fcount(collapse::na_rm(x), name = "n") |> + dplyr::arrange(dplyr::desc(n)) |> + dplyr::slice(1:limit) |> + dplyr::pull(x) |> + stringr::str_flatten_comma() + ) + + df_unique <- purrr::map(df, get_unique) |> + purrr::list_rbind(names_to = "variable") + + joinby <- dplyr::join_by(variable) + + dplyr::left_join(df_types, df_sums, by = joinby) |> + dplyr::left_join(df_unique, by = joinby) |> + dplyr::arrange(dplyr::desc(type)) +} diff --git a/R/fake_data.R b/R/fake_data.R index 773eb2b..f0f8757 100644 --- a/R/fake_data.R +++ b/R/fake_data.R @@ -33,6 +33,8 @@ provider_data <- function(year_seq) { #' #' @param rows number of rows to generate; default is 10 #' +#' @param unnest a logical indicating whether to unnest the dates column; default is `FALSE` +#' #' @returns A [tibble][tibble::tibble-package] #' #' @examplesIf interactive() @@ -41,17 +43,17 @@ provider_data <- function(year_seq) { #' @autoglobal #' #' @noRd -forager_data <- function(rows = 10){ +forager_data <- function(rows = 10, unnest = FALSE){ - dplyr::tibble( - claim_id = wakefield::id(n = rows), + x <- dplyr::tibble( + claim_id = as.character(wakefield::id(n = rows)), date_of_service = wakefield::date_stamp(n = rows, start = lubridate::today() - lubridate::dyears(2), random = TRUE), payer = fixtuRes::set_vector(rows, set = c("Medicare", "Medicaid", "Cigna", "Humana", "UnitedHealth", "Anthem", "BCBS", "Centene")), ins_class = fixtuRes::set_vector(rows, set = c("Primary", "Secondary")), - balance = wakefield::income(n = rows, digits = 2) / 300) |> + balance = as.double(wakefield::income(n = rows, digits = 2) / 300)) |> dplyr::mutate( date_of_service = lubridate::as_date(date_of_service), date_of_release = date_of_service + round(abs(stats::rnorm(length(date_of_service), 11, 4))), @@ -60,4 +62,7 @@ forager_data <- function(rows = 10){ date_of_adjudication = date_of_acceptance + round(abs(stats::rnorm(length(date_of_acceptance), 30, 3)))) |> tidyr::nest(dates = tidyr::contains("date")) + if(unnest) x <- tidyr::unnest_wider(x, dates) + + return(x) } diff --git a/R/generated-globals.R b/R/generated-globals.R index 2b7b5f5..e684760 100644 --- a/R/generated-globals.R +++ b/R/generated-globals.R @@ -19,6 +19,8 @@ utils::globalVariables(c( "date_of_service", # "date_of_submission", + # + "dates", # "group", # @@ -32,8 +34,14 @@ utils::globalVariables(c( # # # + # "n", + # + "type", + # # "value", + # + "variable", NULL )) diff --git a/man/describe.Rd b/man/describe.Rd new file mode 100644 index 0000000..c0e28ec --- /dev/null +++ b/man/describe.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/describe.R +\name{describe} +\alias{describe} +\title{Describe a dataset} +\usage{ +describe(df, ...) +} +\arguments{ +\item{df}{\verb{} desc} + +\item{...}{\verb{} tidyselect columns} +} +\value{ +\verb{} of summary statistics +} +\description{ +Describe a dataset +} +\examples{ +describe(fuimus:::provider_data(2000:2020)) + +describe( + fuimus:::forager_data(200), + !dplyr::starts_with("date") +) + +}