implement #50: compare_df_cols() and associated functions (#284)

* Allow data.frame row-binding comparison (Fix #50) * Allow list inputs to `compare_df_types()` * Address code review comments * typo columne -> column * make bind_rows the default value of bind_method swapping it in for rbind, since this is a tidyverse-aligned package and my quick poll shows more peers using dplyr::bind_rows * re-describe new functions
sfirke · Apr 20, 2019 · 51aa5df · 51aa5df
1 parent f44b71e
commit 51aa5df
Show file tree

Hide file tree

Showing 7 changed files with 756 additions and 2 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,8 @@ S3method(clean_names,default)
 S3method(clean_names,sf)
 S3method(crosstab,data.frame)
 S3method(crosstab,default)
+S3method(describe_class,default)
+S3method(describe_class,factor)
 S3method(fisher.test,default)
 S3method(fisher.test,tabyl)
 S3method(print,tabyl)
@@ -24,8 +26,11 @@ export(adorn_totals)
 export(as_tabyl)
 export(chisq.test)
 export(clean_names)
+export(compare_df_cols)
+export(compare_df_cols_same)
 export(convert_to_NA)
 export(crosstab)
+export(describe_class)
 export(excel_numeric_to_date)
 export(fisher.test)
 export(get_dupes)

diff --git a/NEWS.md b/NEWS.md
@@ -4,11 +4,13 @@
 
 The new function `make_clean_names()` takes a character vector and returns the cleaned text, with the same functionality as the existing `clean_names()`, which runs on a data.frame, manipulating its names. (#197, thanks **@tazinho** and everyone who contributed to the discussion).
 
-This new function can be supplied as a value for the `.name_repair` argument of `as_tibble()` in the `tibble` package.  For example: `as_tibble(iris, .name_repair = make_clean_names)`.
+This function can be supplied as a value for the `.name_repair` argument of `as_tibble()` in the `tibble` package.  For example: `as_tibble(iris, .name_repair = make_clean_names)`.
 
 `remove_empty()` now has a companion function `remove_constant()` which removes columns have a single value, optionally ignoring `NA` (#222, thanks to **@billdenney** for suggesting & implementing).
 
-Two new function `janitor::chisq.test()` and `janitor::fisher.test()` allow to apply their `stats` equivalent to two-way tabyl objects.
+Added the functions `janitor::chisq.test()` and `janitor::fisher.test()` to enable running these statistical tests from the base `stats` package on two-way `tabyl` objects.  While the package loading message says the base functions are masked, the base tests still run on `table` objects.
+
+The new function `compare_df_cols()` compares the names and classes of columns in a set of supplied data.frames or tibbles, reporting on the specific columns that are or are not similar.  This is for the common use case where a set of data files should all have the same specifications but, in practice, may not. A companion function `compare_df_cols_same()` gives a `TRUE/FALSE` result indicating if the columns are the same (and therefore bindable, though FALSE is not definitive that binding will fail).  The helper function `describe_class()` describes a variable's class to make differences between data.frames clear at a glance - it is used by developers in extending the `compare_df` functions to custom classes (#50, thanks to **@billdenney** for the feature.)
 
 ## Minor features
 

diff --git a/R/compare_df_cols.R b/R/compare_df_cols.R
@@ -0,0 +1,266 @@
+#' Generate a comparison of data.frames (or similar objects) that indicates if
+#' they will successfully bind together by rows.
+#'
+#' @details Due to the returned "column_name" column, no input data.frame may be
+#'   named "column_name".
+#'
+#'   The \code{strict_description} argument is most typically used to understand
+#'   if factor levels match or are bindable.  Factors are typically bindable,
+#'   but the behavior of what happens when they bind differs based on the
+#'   binding method ("bind_rows" or "rbind").  Even when
+#'   \code{strict_description} is \code{FALSE}, data.frames may still bind
+#'   because some classes (like factors and characters) can bind even if they
+#'   appear to differ.
+#'
+#' @param ... A combination of data.frames, tibbles, and lists of
+#'   data.frames/tibbles.  The values may optionally be named arguments; if
+#'   named, the output column will be the name; if not named, the output column
+#'   will be the data.frame name (see examples section).
+#' @param return Should a summary of "all" columns be returned, only return
+#'   "match"ing columns, or only "mismatch"ing columns?
+#' @param bind_method What method of binding should be used to determine
+#'   matches? With "bind_rows", columns missing from a data.frame would be
+#'   considered a match (as in \code{dplyr::bind_rows()}; with "rbind", columns
+#'   missing from a data.frame would be considered a mismatch (as in
+#'   \code{base::rbind()}.
+#' @param strict_description Passed to \code{describe_class}.  Also, see the
+#'   Details section.
+#' @return A data.frame with a column named "column_name" with a value named
+#'   after the input data.frames' column names, and then one column per
+#'   data.frame (named after the input data.frame).  If more than one input has
+#'   the same column name, the column naming will have suffixes defined by
+#'   sequential use of \code{base::merge()} and may differ from expected naming.
+#'   The rows within the data.frame-named columns are descriptions of the
+#'   classes of the data within the columns (generated by
+#'   \code{describe_class}).
+#' @examples
+#' compare_df_cols(data.frame(A=1), data.frame(B=2))
+#' # user-defined names
+#' compare_df_cols(dfA=data.frame(A=1), dfB=data.frame(B=2))
+#' # a combination of list and data.frame input
+#' compare_df_cols(listA=list(dfA=data.frame(A=1), dfB=data.frame(B=2)), data.frame(A=3))
+#' @family Data frame type comparison
+#' @export
+compare_df_cols <- function(..., return=c("all", "match", "mismatch"), bind_method=c("bind_rows", "rbind"), strict_description=FALSE) {
+  # Input checking
+  return <- match.arg(return)
+  bind_method <- match.arg(bind_method)
+  args <- list(...)
+  mask_input_data_frame <- sapply(X=args, FUN=is.data.frame)
+  mask_input_list <- sapply(X=args, FUN=is.list) & !mask_input_data_frame
+  mask_input_other <- !(mask_input_data_frame | mask_input_list)
+  if (any(mask_input_other)) {
+    stop("Input given with `...` must be either a data.frame or a list of data.frames.")
+  }
+  bad_list_inputs <- numeric(0)
+  for (idx in which(mask_input_list)) {
+    bad_list_inputs <-
+      c(
+        bad_list_inputs,
+        if (!all(sapply(X=args[[idx]], FUN=is.data.frame))) {
+          idx
+        } else {
+          numeric(0)
+        }
+      )
+  }
+  if (length(bad_list_inputs)) {
+    stop(
+      "List inputs must be lists of data.frames.  List input ",
+      if (length(bad_list_inputs) == 1) {
+        paste("number", bad_list_inputs, "is not a list of data.frames.")
+      } else if (length(bad_list_inputs) < 6) {
+        paste("numbers", paste(bad_list_inputs, collapse=", "), "are not lists of data.frames.")
+      } else {
+        paste("numbers", paste(c(bad_list_inputs[1:5], "..."), collapse=", "), "are not lists of data.frames.")
+      }
+    )
+  }
+
+  # Generate and check column names
+  direct_names <- names(args)
+  indirect_names <- as.character(match.call(expand.dots=TRUE))
+  indirect_names <- indirect_names[!(indirect_names %in% as.character(match.call(expand.dots=FALSE)))]
+  if (is.null(direct_names)) {
+    final_names <- indirect_names
+  } else {
+    final_names <- direct_names
+    mask_replace <- final_names %in% ""
+    final_names[mask_replace] <- indirect_names[mask_replace]
+  }
+  final_names <- as.list(final_names)
+  for (idx in which(mask_input_list)) {
+    current_list_names <- names(args[[idx]])
+    final_names[[idx]] <-
+      if (is.null(current_list_names)) {
+        paste(final_names[[idx]], seq_along(args[[idx]]), sep="_")
+      } else if (any(mask_unnamed_list <- current_list_names %in% "")) {
+        current_list_names[mask_unnamed_list] <-
+          paste(
+            final_names[[idx]][mask_unnamed_list],
+            seq_len(sum(mask_unnamed_list)),
+            sep="_"
+          )
+        current_list_names
+      } else {
+        current_list_names
+      }
+  }
+  if (any(unlist(final_names) %in% "column_name")) {
+    stop("None of the input ... argument names or list names may be `column_name`.")
+  }
+  ret <- compare_df_cols_df_maker(args, class_colname=final_names, strict_description=strict_description)
+  if (return == "all" | ncol(ret) == 2) {
+    if (return != "all") {
+      warning("Only one data.frame provided, so all its classes are provided.")
+    }
+    rownames(ret) <- NULL
+    ret
+  } else {
+    # Choose which way to test if the rows are bindable (NA matches or not).
+    bind_method_fun <-
+      list(
+        rbind=function(idx) {
+          all(unlist(ret[idx,3:ncol(ret)]) %in% ret[idx,2])
+        },
+        bind_rows=function(idx) {
+          all(
+            unlist(ret[idx,3:ncol(ret)]) %in%
+              c(NA_character_,
+                na.omit(unlist(ret[idx,2:ncol(ret)]))[1])
+          )
+        }
+      )
+    mask_match <-
+      sapply(
+        X=seq_len(nrow(ret)),
+        FUN=bind_method_fun[[bind_method]]
+      )
+    ret <-
+      if (return == "match") {
+        ret[mask_match,]
+      } else if (return == "mismatch") {
+        ret[!mask_match,]
+      }
+    rownames(ret) <- NULL
+    ret
+  }
+}
+
+#' This is the workhorse for making a data.frame description used by
+#' compare_df_cols
+#' @param x The data.frame or list of data.frames
+#' @param class_colname The name for the column-name-defining column
+#' @param strict_description Passed to \code{describe_class}
+#' @return A 2-column data.frame with the first column naming all the columns of
+#'   \code{x} and the second column (named after the value in
+#'   \code{class_colname}) defining the classes using
+#'   \code{describe_class()}.
+#' @noRd
+compare_df_cols_df_maker <- function(x, class_colname="class", strict_description)
+  UseMethod("compare_df_cols_df_maker")
+
+compare_df_cols_df_maker.data.frame <- function(x, class_colname="class", strict_description) {
+  if (class_colname == "column_name") {
+    stop('`class_colname` cannot be "column_name"')
+  }
+  if (ncol(x) == 0) {
+    warning(class_colname, " has zero columns and will not appear in output.")
+    ret <- data.frame(column_name=character(0), stringsAsFactors=FALSE)
+  } else {
+    ret <-
+      data.frame(
+        column_name=names(x),
+        X=sapply(X=x, FUN=describe_class, strict_description=strict_description),
+        stringsAsFactors=FALSE
+      )
+    names(ret)[2] <- class_colname
+  }
+  ret
+}
+
+compare_df_cols_df_maker.list <- function(x, class_colname="class", strict_description=strict_description) {
+  if (length(class_colname) != length(x)) {
+    stop("`x` and `class_colname` must be the same length.")
+  } else if (any(class_colname == "column_name")) {
+    stop('`class_colname` cannot be "column_name"')
+  }
+  ret <-
+    lapply(
+      X=seq_along(x),
+      FUN=function(idx) {
+        compare_df_cols_df_maker(x=x[[idx]], class_colname=class_colname[[idx]], strict_description=strict_description)
+      }
+    )
+  Reduce(f=function(x, y) {merge(x, y, by="column_name", all=TRUE)}, x=ret)
+}
+
+#' Are the data.frames the same?
+#'
+#' @inheritParams compare_df_cols
+#' @param verbose Print the mismatching columns if binding will fail.
+#' @return \code{TRUE} if row binding will succeed or \code{FALSE} if it will
+#'   fail.
+#' @family Data frame type comparison
+#' @examples
+#' compare_df_cols_same(data.frame(A=1), data.frame(A=2))
+#' compare_df_cols_same(data.frame(A=1), data.frame(B=2))
+#' compare_df_cols_same(data.frame(A=1), data.frame(B=2), verbose=FALSE)
+#' compare_df_cols_same(data.frame(A=1), data.frame(B=2), bind_method="rbind")
+#' @export
+compare_df_cols_same <- function(..., return="mismatch", bind_method=c("bind_rows", "rbind"), verbose=TRUE) {
+  return <- match.arg(return)
+  bind_method <- match.arg(bind_method)
+  ret <- compare_df_cols(..., return=return, bind_method=bind_method)
+  if (nrow(ret) & verbose) {
+    print(ret)
+  }
+  nrow(ret) == 0
+}
+
+#' Describe the class(es) of an object
+#'
+#' @details For package developers, an S3 generic method can be written for
+#'   \code{describe_class()} for custom classes that may need more definition
+#'   than the default method.
+#'
+#' @param x The object to describe
+#' @param strict_description Should the 
+#' @return A character scalar describing the class(es) of an object where if the
+#'   scalar will match, columns in a data.frame (or similar object) should bind
+#'   together without issue.
+#' @family Data frame type comparison
+#' @examples
+#' describe_class(1)
+#' describe_class(factor("A"))
+#' describe_class(ordered(c("A", "B")))
+#' describe_class(ordered(c("A", "B")), strict_description=FALSE)
+#' @export
+describe_class <- function(x, strict_description=TRUE) {
+  UseMethod("describe_class")
+}
+
+#' @describeIn describe_class Describe factors with their levels
+#'   and if they are ordered.
+#' @export
+describe_class.factor <- function(x, strict_description=TRUE) {
+  if (strict_description) {
+    all_classes <- class(x)
+    all_levels <- levels(x)
+    level_text <- sprintf("levels=c(%s)", paste('"', levels(x), '"', sep="", collapse=", "))
+    factor_text <- sprintf("factor(%s)", level_text)
+    mask_factor <- class(x) == "factor"
+    all_classes[mask_factor] <- factor_text
+    paste(all_classes, collapse=", ")
+  } else {
+    all_classes <- setdiff(class(x), "ordered")
+    paste(all_classes, collapse=", ")
+  }
+}
+
+#' @describeIn describe_class List all classes of an object.
+#' @export
+describe_class.default <- function(x, strict_description=TRUE) {
+  all_classes <- class(x)
+  paste(all_classes, collapse=", ")
+}
diff --git a/man/compare_df_cols.Rd b/man/compare_df_cols.Rd