Skip to content

Commit

Permalink
* describe() first draft
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewallenbruce committed Jul 30, 2024
1 parent 3e8c641 commit a2f2e0b
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 4 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ Depends:
Imports:
clock,
collapse,
cheapr,
pillar,
dplyr,
forcats,
fs,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ export(count_wide)
export(create_vec)
export(delister)
export(density)
export(describe)
export(df_2_chr)
export(display_long)
export(duration_vec)
Expand Down
80 changes: 80 additions & 0 deletions R/describe.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#' Describe a dataset
#'
#' @param df `<data.frame>` desc
#'
#' @param ... `<dots>` tidyselect columns
#'
#' @returns `<tibble>` of summary statistics
#'
#' @examples
#' describe(fuimus:::provider_data(2000:2020))
#'
#' describe(
#' fuimus:::forager_data(200),
#' !dplyr::starts_with("date")
#' )
#'
#' @autoglobal
#'
#' @export
describe <- function(df, ...) {

if (nargs() > 1) df <- dplyr::select(df, ...)

df_sums <- df |>
dplyr::mutate_if(is.character, stringr::str_length) |>
dplyr::mutate_if(is.factor, as.numeric) |>
dplyr::mutate_if(is.logical, as.numeric) |>
tidyr::pivot_longer(
cols = dplyr::everything(),
names_to = "variable"
) |>
dplyr::mutate(n = 1 - is.na(value)) |>
dplyr::reframe(
n = as.integer(base::sum(n)),
amean = base::mean(value, na.rm = TRUE),
gmean = fuimus::geomean(value),
sd = stats::sd(value, na.rm = TRUE),
iqr = stats::IQR(value, na.rm = TRUE),
median = stats::median(value, na.rm = TRUE),
mad = stats::mad(value, na.rm = TRUE),
range = as.character(stringr::str_glue(
"[",
"{base::min(value, na.rm = TRUE)}",
" - ",
"{as.integer(base::max(value, na.rm = TRUE))}",
"]"
)
),
hist = cheapr:::inline_hist(value),
.by = variable
)

get_type <- \(x) dplyr::tibble(
variable = names(x),
type = stringr::str_c("<", pillar::type_sum(x), ">") |>
forcats::as_factor()
)

df_types <- purrr::map(df, get_type) |>
purrr::list_rbind(names_to = "variable")

get_unique <- \(x, limit = 5) dplyr::tibble(
variable = names(x),
n_uniq = collapse::fnunique(collapse::na_rm(x)),
top_5 = collapse::fcount(collapse::na_rm(x), name = "n") |>
dplyr::arrange(dplyr::desc(n)) |>
dplyr::slice(1:limit) |>
dplyr::pull(x) |>
stringr::str_flatten_comma()
)

df_unique <- purrr::map(df, get_unique) |>
purrr::list_rbind(names_to = "variable")

joinby <- dplyr::join_by(variable)

dplyr::left_join(df_types, df_sums, by = joinby) |>
dplyr::left_join(df_unique, by = joinby) |>
dplyr::arrange(dplyr::desc(type))
}
13 changes: 9 additions & 4 deletions R/fake_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ provider_data <- function(year_seq) {
#'
#' @param rows number of rows to generate; default is 10
#'
#' @param unnest a logical indicating whether to unnest the dates column; default is `FALSE`
#'
#' @returns A [tibble][tibble::tibble-package]
#'
#' @examplesIf interactive()
Expand All @@ -41,17 +43,17 @@ provider_data <- function(year_seq) {
#' @autoglobal
#'
#' @noRd
forager_data <- function(rows = 10){
forager_data <- function(rows = 10, unnest = FALSE){

dplyr::tibble(
claim_id = wakefield::id(n = rows),
x <- dplyr::tibble(
claim_id = as.character(wakefield::id(n = rows)),
date_of_service = wakefield::date_stamp(n = rows,
start = lubridate::today() - lubridate::dyears(2),
random = TRUE),
payer = fixtuRes::set_vector(rows,
set = c("Medicare", "Medicaid", "Cigna", "Humana", "UnitedHealth", "Anthem", "BCBS", "Centene")),
ins_class = fixtuRes::set_vector(rows, set = c("Primary", "Secondary")),
balance = wakefield::income(n = rows, digits = 2) / 300) |>
balance = as.double(wakefield::income(n = rows, digits = 2) / 300)) |>
dplyr::mutate(
date_of_service = lubridate::as_date(date_of_service),
date_of_release = date_of_service + round(abs(stats::rnorm(length(date_of_service), 11, 4))),
Expand All @@ -60,4 +62,7 @@ forager_data <- function(rows = 10){
date_of_adjudication = date_of_acceptance + round(abs(stats::rnorm(length(date_of_acceptance), 30, 3)))) |>
tidyr::nest(dates = tidyr::contains("date"))

if(unnest) x <- tidyr::unnest_wider(x, dates)

return(x)
}
8 changes: 8 additions & 0 deletions R/generated-globals.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ utils::globalVariables(c(
"date_of_service",
# <forager_data>
"date_of_submission",
# <forager_data>
"dates",
# <id_runs>
"group",
# <id_runs>
Expand All @@ -32,8 +34,14 @@ utils::globalVariables(c(
# <count_prop>
# <count_prop_multi>
# <count_wide>
# <describe>
"n",
# <describe>
"type",
# <describe>
# <id_runs>
"value",
# <describe>
"variable",
NULL
))
28 changes: 28 additions & 0 deletions man/describe.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit a2f2e0b

Please sign in to comment.