From 16dba690c6c915e5ba11d8b114dee755cd17d627 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Thu, 11 Apr 2019 22:06:34 -0400 Subject: [PATCH 1/6] Allow data.frame row-binding comparison (Fix #50) --- NAMESPACE | 5 + NEWS.md | 2 + R/compare_df_types.R | 174 +++++++++++++++++++++++++ man/compare_df_types.Rd | 49 +++++++ man/compare_df_types_class_detect.Rd | 43 ++++++ man/compare_df_types_success.Rd | 43 ++++++ tests/testthat/test-compare_df_types.R | 147 +++++++++++++++++++++ 7 files changed, 463 insertions(+) create mode 100644 R/compare_df_types.R create mode 100644 man/compare_df_types.Rd create mode 100644 man/compare_df_types_class_detect.Rd create mode 100644 man/compare_df_types_success.Rd create mode 100644 tests/testthat/test-compare_df_types.R diff --git a/NAMESPACE b/NAMESPACE index 7fef5420..cf280c11 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,8 @@ S3method(chisq.test,default) S3method(chisq.test,tabyl) S3method(clean_names,default) S3method(clean_names,sf) +S3method(compare_df_types_class_detect,default) +S3method(compare_df_types_class_detect,factor) S3method(crosstab,data.frame) S3method(crosstab,default) S3method(fisher.test,default) @@ -24,6 +26,9 @@ export(adorn_totals) export(as_tabyl) export(chisq.test) export(clean_names) +export(compare_df_types) +export(compare_df_types_class_detect) +export(compare_df_types_success) export(convert_to_NA) export(crosstab) export(excel_numeric_to_date) diff --git a/NEWS.md b/NEWS.md index 47714652..859153ec 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,8 @@ This new function can be supplied as a value for the `.name_repair` argument of Two new function `janitor::chisq.test()` and `janitor::fisher.test()` allow to apply their `stats` equivalent to two-way tabyl objects. +The new function `compare_df_types()` allows checking if data.frame (or similar object) row binding will succeed and reports on specific columns that will or will not succeed. 
A companion function `compare_df_types_success()` gives a TRUE/FALSE result (simplifying testing if the bind will work in code), and `compare_df_types_class_detect()` is a generic that allows overriding defaults if binding will work for specific classes (#50, thanks to **@billdenney** for the feature.) + ## Minor features * `excel_numeric_to_date()` now returns a POSIXct object and includes a time zone. (#225, thanks to **@billdenney** for the feature.) diff --git a/R/compare_df_types.R b/R/compare_df_types.R new file mode 100644 index 00000000..c1624d55 --- /dev/null +++ b/R/compare_df_types.R @@ -0,0 +1,174 @@ +#' Generate a comparison of data.frames (or similar objects) that indicates if +#' they will successfully bind together by rows. +#' +#' @details Due to the returned "column_name" column, no input data.frame may be +#' named "column_name". +#' +#' @param ... data.frames or similar objects. The values may optionally be +#' named arguments; if named, the output column will be the name; if not +#' named, the output columne will be the data.frame name (see examples +#' section). +#' @param return Should a sumary of "all" columns be returned, only return +#' "match"ing columns, or only "mismatch"ing columns. +#' @param bind_check What method of binding shoudl be used to determine matches? +#' With "rbind", columns missing from a data.frame would be considered a +#' mismatch; with "bind_rows" columns missing from a data.frame would be +#' considered a match (as in \code{dplyr::bind_rows()}. +#' @return A data.frame with a column named "column_name" and other columns +#' named after the input data.frames' column names, and then one column per +#' data.frame (named after the input data.frame). The rows within the +#' data.frame-named columns are descriptions of the classes of the data within +#' the columns (generated by \code{compare_df_types_class_detect}). 
+#' @examples +#' compare_df_types(data.frame(A=1), data.frame(B=2)) +#' # user-defined names +#' compare_df_types(dfA=data.frame(A=1), dfB=data.frame(B=2)) +#' @family Data frame type comparison +#' @export +compare_df_types <- function(..., return=c("all", "match", "mismatch"), bind_check=c("rbind", "bind_rows")) { + return <- match.arg(return) + bind_check <- match.arg(bind_check) + direct_names <- names(list(...)) + indirect_names <- + setdiff( + as.character(match.call(expand.dots=TRUE)), + as.character(match.call(expand.dots=FALSE)) + ) + if (is.null(direct_names)) { + final_names <- indirect_names + } else { + final_names <- direct_names + mask_replace <- final_names %in% "" + final_names[mask_replace] <- indirect_names[mask_replace] + } + if (any(final_names %in% "column_name")) { + stop("None of the input ... arguments must be named `column_name`.") + } + args <- list(...) + ret <- compare_df_types_df_maker(args[[1]], class_colname=final_names[1]) + for (idx in (1+seq_len(length(args) - 1))) { + ret <- + merge( + ret, + compare_df_types_df_maker(args[[idx]], class_colname=final_names[idx]), + by="column_name", + all=TRUE + ) + } + if (return == "all" | ncol(ret) == 2) { + if (return != "all") { + warning("Only one data.frame provided, so all its classes are provided.") + } + rownames(ret) <- NULL + ret + } else { + # Is this the best way to check for all row values to be equal? 
+ bind_check_fun <- + list( + rbind=function(idx) { + all(unlist(ret[idx,3:ncol(ret)]) %in% ret[idx,2]) + }, + bind_rows=function(idx) { + all( + unlist(ret[idx,3:ncol(ret)]) %in% + c(NA_character_, + na.omit(unlist(ret[idx,2:ncol(ret)]))[1]) + ) + } + ) + mask_match <- + sapply( + X=seq_len(nrow(ret)), + FUN=bind_check_fun[[bind_check]] + ) + ret <- + if (return == "match") { + ret[mask_match,] + } else if (return == "mismatch") { + ret[!mask_match,] + } + rownames(ret) <- NULL + ret + } +} + +#' This is the workhorse for making a data.frame description used by +#' compare_df_types +#' @param x The data.frame +#' @param class_colname The name for the column-name-defining column +#' @return A 2-column data.frame with the first column naming all the columns of +#' \code{x} and the second column (named after the value in +#' \code{class_colname}) defining the classes using +#' \code{compare_df_types_class_detect()}. +#' @noRd +compare_df_types_df_maker <- function(x, class_colname="class") { + if (class_colname == "column_name") { + stop('`class_colname` cannot be "column_name"') + } + ret <- + data.frame( + column_name=names(x), + X=sapply(X=x, FUN=compare_df_types_class_detect), + stringsAsFactors=FALSE + ) + names(ret)[2] <- class_colname + ret +} + +#' Will row binding succeed? +#' @inheritParams compare_df_types +#' @param verbose Print the mismatching columns if binding will fail. +#' @return \code{TRUE} if row binding will succeed or \code{FALSE} if it will +#' fail. 
+#' @family Data frame type comparison +#' @examples +#' compare_df_types_success(data.frame(A=1), data.frame(A=2)) +#' compare_df_types_success(data.frame(A=1), data.frame(B=2)) +#' compare_df_types_success(data.frame(A=1), data.frame(B=2), verbose=FALSE) +#' compare_df_types_success(data.frame(A=1), data.frame(B=2), bind_check="bind_rows") +#' @export +compare_df_types_success <- function(..., return="mismatch", bind_check=c("rbind", "bind_rows"), verbose=TRUE) { + return <- match.arg(return) + bind_check <- match.arg(bind_check) + ret <- compare_df_types(..., return=return, bind_check=bind_check) + if (nrow(ret) & verbose) { + print(ret) + } + nrow(ret) == 0 +} + +#' Describe the class(es) of an object +#' +#' @details An S3 generic method can be written for +#' \code{compare_df_types_class_detect()} for other types that may need more +#' definition than the default method. +#' +#' @param x The object to describe +#' @return A character scalar describing the class(es) of an object where if the +#' scalar will match, columns in a data.frame (or similar object) should bind +#' together without issue. +#' @family Data frame type comparison +#' @export +compare_df_types_class_detect <- function(x) { + UseMethod("compare_df_types_class_detect") +} + +#' @describeIn compare_df_types_class_detect Describe factors with their levels +#' and if they are ordered. +#' @export +compare_df_types_class_detect.factor <- function(x) { + all_classes <- class(x) + all_levels <- levels(x) + level_text <- sprintf("levels=c(%s)", paste('"', levels(x), '"', sep="", collapse=", ")) + factor_text <- sprintf("factor(%s)", level_text) + mask_factor <- class(x) == "factor" + all_classes[mask_factor] <- factor_text + paste(all_classes, collapse=", ") +} + +#' @describeIn compare_df_types_class_detect List all classes of an object. 
+#' @export +compare_df_types_class_detect.default <- function(x) { + all_classes <- class(x) + paste(all_classes, collapse=", ") +} diff --git a/man/compare_df_types.Rd b/man/compare_df_types.Rd new file mode 100644 index 00000000..e6743f61 --- /dev/null +++ b/man/compare_df_types.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare_df_types.R +\name{compare_df_types} +\alias{compare_df_types} +\title{Generate a comparison of data.frames (or similar objects) that indicates if +they will successfully bind together by rows.} +\usage{ +compare_df_types(..., return = c("all", "match", "mismatch"), + bind_check = c("rbind", "bind_rows")) +} +\arguments{ +\item{...}{data.frames or similar objects. The values may optionally be +named arguments; if named, the output column will be the name; if not +named, the output columne will be the data.frame name (see examples +section).} + +\item{return}{Should a sumary of "all" columns be returned, only return +"match"ing columns, or only "mismatch"ing columns.} + +\item{bind_check}{What method of binding shoudl be used to determine matches? +With "rbind", columns missing from a data.frame would be considered a +mismatch; with "bind_rows" columns missing from a data.frame would be +considered a match (as in \code{dplyr::bind_rows()}.} +} +\value{ +A data.frame with a column named "column_name" and other columns + named after the input data.frames' column names, and then one column per + data.frame (named after the input data.frame). The rows within the + data.frame-named columns are descriptions of the classes of the data within + the columns (generated by \code{compare_df_types_class_detect}). +} +\description{ +Generate a comparison of data.frames (or similar objects) that indicates if +they will successfully bind together by rows. +} +\details{ +Due to the returned "column_name" column, no input data.frame may be + named "column_name". 
+} +\examples{ +compare_df_types(data.frame(A=1), data.frame(B=2)) +# user-defined names +compare_df_types(dfA=data.frame(A=1), dfB=data.frame(B=2)) +} +\seealso{ +Other Data frame type comparison: \code{\link{compare_df_types_class_detect}}, + \code{\link{compare_df_types_success}} +} +\concept{Data frame type comparison} diff --git a/man/compare_df_types_class_detect.Rd b/man/compare_df_types_class_detect.Rd new file mode 100644 index 00000000..fd0d0d6b --- /dev/null +++ b/man/compare_df_types_class_detect.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare_df_types.R +\name{compare_df_types_class_detect} +\alias{compare_df_types_class_detect} +\alias{compare_df_types_class_detect.factor} +\alias{compare_df_types_class_detect.default} +\title{Describe the class(es) of an object} +\usage{ +compare_df_types_class_detect(x) + +\method{compare_df_types_class_detect}{factor}(x) + +\method{compare_df_types_class_detect}{default}(x) +} +\arguments{ +\item{x}{The object to describe} +} +\value{ +A character scalar describing the class(es) of an object where if the + scalar will match, columns in a data.frame (or similar object) should bind + together without issue. +} +\description{ +Describe the class(es) of an object +} +\details{ +An S3 generic method can be written for + \code{compare_df_types_class_detect()} for other types that may need more + definition than the default method. +} +\section{Methods (by class)}{ +\itemize{ +\item \code{factor}: Describe factors with their levels +and if they are ordered. + +\item \code{default}: List all classes of an object. 
+}} + +\seealso{ +Other Data frame type comparison: \code{\link{compare_df_types_success}}, + \code{\link{compare_df_types}} +} +\concept{Data frame type comparison} diff --git a/man/compare_df_types_success.Rd b/man/compare_df_types_success.Rd new file mode 100644 index 00000000..efb01cd9 --- /dev/null +++ b/man/compare_df_types_success.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare_df_types.R +\name{compare_df_types_success} +\alias{compare_df_types_success} +\title{Will row binding succeed?} +\usage{ +compare_df_types_success(..., return = "mismatch", + bind_check = c("rbind", "bind_rows"), verbose = TRUE) +} +\arguments{ +\item{...}{data.frames or similar objects. The values may optionally be +named arguments; if named, the output column will be the name; if not +named, the output columne will be the data.frame name (see examples +section).} + +\item{return}{Should a sumary of "all" columns be returned, only return +"match"ing columns, or only "mismatch"ing columns.} + +\item{bind_check}{What method of binding shoudl be used to determine matches? +With "rbind", columns missing from a data.frame would be considered a +mismatch; with "bind_rows" columns missing from a data.frame would be +considered a match (as in \code{dplyr::bind_rows()}.} + +\item{verbose}{Print the mismatching columns if binding will fail.} +} +\value{ +\code{TRUE} if row binding will succeed or \code{FALSE} if it will + fail. +} +\description{ +Will row binding succeed? 
+} +\examples{ +compare_df_types_success(data.frame(A=1), data.frame(A=2)) +compare_df_types_success(data.frame(A=1), data.frame(B=2)) +compare_df_types_success(data.frame(A=1), data.frame(B=2), verbose=FALSE) +compare_df_types_success(data.frame(A=1), data.frame(B=2), bind_check="bind_rows") +} +\seealso{ +Other Data frame type comparison: \code{\link{compare_df_types_class_detect}}, + \code{\link{compare_df_types}} +} +\concept{Data frame type comparison} diff --git a/tests/testthat/test-compare_df_types.R b/tests/testthat/test-compare_df_types.R new file mode 100644 index 00000000..58082ec0 --- /dev/null +++ b/tests/testthat/test-compare_df_types.R @@ -0,0 +1,147 @@ +context("compare_df_types") + + +test_that("data.frame comparison works", { + # Names are intentionally not pretty to make it easier to see the source. + # These data.frames are not typically used, and if the input name is needed, + # it can be a named argument. + expect_equal( + compare_df_types(data.frame(A=1), data.frame(B=2)), + setNames( + data.frame( + column_name=c("A", "B"), + A=c("numeric", NA), + B=c(NA, "numeric"), + stringsAsFactors=FALSE + ), + c("column_name", + "data.frame(A = 1)", + "data.frame(B = 2)") + ), + info="Names are detected from unnamed input" + ) + expect_equal( + compare_df_types(foo=data.frame(A=1), bar=data.frame(B=2)), + data.frame( + column_name=c("A", "B"), + foo=c("numeric", NA), + bar=c(NA, "numeric"), + stringsAsFactors=FALSE + ), + info="Names can be used from the input" + ) + expect_equal( + compare_df_types(foo=data.frame(A=1), data.frame(B=2)), + setNames( + data.frame( + column_name=c("A", "B"), + A=c("numeric", NA), + B=c(NA, "numeric"), + stringsAsFactors=FALSE + ), + c("column_name", + "foo", + "data.frame(B = 2)") + ), + info="Names are detected from unnamed input and can be mixed with named arguments" + ) + expect_equal( + compare_df_types(foo=data.frame(A=1, B=1), bar=data.frame(B=2)), + data.frame( + column_name=c("A", "B"), + foo="numeric", + 
bar=c(NA, "numeric"), + stringsAsFactors=FALSE + ), + info="all output comes through when requested" + ) + expect_equal( + compare_df_types(foo=data.frame(A=1, B=1), bar=data.frame(B=2), return="match"), + data.frame( + column_name="B", + foo="numeric", + bar="numeric", + stringsAsFactors=FALSE + ), + info="only matching output comes through when requested" + ) + expect_equal( + compare_df_types(foo=data.frame(A=1, B=1), bar=data.frame(B=2), return="mismatch"), + data.frame( + column_name="A", + foo="numeric", + bar=NA_character_, + stringsAsFactors=FALSE + ), + info="only mismatching output comes through when requested" + ) + expect_warning( + expect_equal( + compare_df_types(foo=data.frame(A=1, B=1), return="mismatch"), + data.frame( + column_name=c("A", "B"), + foo="numeric", + stringsAsFactors=FALSE + ), + info="A single data.frame gives all results" + ), + info="A single data.frame isn't very meaningful, so the user is warned that they probably didn't do what they meant to do." + ) + expect_equal( + compare_df_types( + foo=data.frame(A=1, B=1, C=factor("A"), D=factor("B")), + bar=data.frame(B=2, C=factor("A"), D=factor(c("A", "B"))), + return="mismatch" + ), + data.frame( + column_name=c("A", "D"), + foo=c("numeric", 'factor(levels=c("B"))'), + bar=c(NA_character_, 'factor(levels=c("A", "B"))'), + stringsAsFactors=FALSE + ), + info="only mismatching output comes through when requested (and it works with something a bit more complex than numeric)" + ) + expect_equal( + compare_df_types( + foo=data.frame(A=1, B=1, C=factor("A"), D=factor("B")), + bar=data.frame(B=2, C=factor("A"), D=factor(c("A", "B"))), + return="mismatch", + bind_check="bind_rows" + ), + data.frame( + column_name="D", + foo='factor(levels=c("B"))', + bar='factor(levels=c("A", "B"))', + stringsAsFactors=FALSE + ), + info="bind_rows output skips NA" + ) +}) + +test_that("class detection works", { + expect_equal(compare_df_types_class_detect(5), "numeric") + 
expect_equal(compare_df_types_class_detect("A"), "character") + expect_equal( + compare_df_types_class_detect(as.POSIXct("2019-01-02")), "POSIXct, POSIXt", + info="multiple classes work" + ) + expect_equal( + compare_df_types_class_detect(factor("A")), + 'factor(levels=c("A"))' + ) + expect_equal( + compare_df_types_class_detect(factor("A", ordered=TRUE)), + 'ordered, factor(levels=c("A"))' + ) + expect_equal( + compare_df_types_class_detect(factor(c("A", "B"), ordered=TRUE)), + 'ordered, factor(levels=c("A", "B"))' + ) +}) + +test_that("boolean df comparison works", { + expect_true(compare_df_types_success(data.frame(A=1), data.frame(A=2))) + expect_output(expect_false(compare_df_types_success(data.frame(A=1), data.frame(B=2)))) + expect_silent(expect_false(compare_df_types_success(data.frame(A=1), data.frame(B=2), verbose=FALSE))) + expect_true(compare_df_types_success(data.frame(A=1), data.frame(B=2), bind_check="bind_rows")) +}) From bc13ad0361b50bf941e90e175c23bed0f1a0780c Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Mon, 15 Apr 2019 10:34:29 -0400 Subject: [PATCH 2/6] Allow list inputs to `compare_df_types()` --- R/compare_df_types.R | 141 +++++++++++++++++++------ man/compare_df_types.Rd | 33 +++--- man/compare_df_types_success.Rd | 19 ++-- tests/testthat/test-compare_df_types.R | 136 +++++++++++++++++++++++- 4 files changed, 270 insertions(+), 59 deletions(-) diff --git a/R/compare_df_types.R b/R/compare_df_types.R index c1624d55..05bc5663 100644 --- a/R/compare_df_types.R +++ b/R/compare_df_types.R @@ -4,31 +4,70 @@ #' @details Due to the returned "column_name" column, no input data.frame may be #' named "column_name". #' -#' @param ... data.frames or similar objects. The values may optionally be -#' named arguments; if named, the output column will be the name; if not -#' named, the output columne will be the data.frame name (see examples -#' section). 
-#' @param return Should a sumary of "all" columns be returned, only return -#' "match"ing columns, or only "mismatch"ing columns. -#' @param bind_check What method of binding shoudl be used to determine matches? +#' @param ... data.frames or similar objects (such as tibbles), or lists of +#' data.frames (or similar objects). The values may optionally be named +#' arguments; if named, the output column will be the name; if not named, the +#' output column will be the data.frame name (see examples section). +#' @param return Should a summary of "all" columns be returned, only return +#' "match"ing columns, or only "mismatch"ing columns? +#' @param bind_check What method of binding should be used to determine matches? #' With "rbind", columns missing from a data.frame would be considered a -#' mismatch; with "bind_rows" columns missing from a data.frame would be -#' considered a match (as in \code{dplyr::bind_rows()}. -#' @return A data.frame with a column named "column_name" and other columns -#' named after the input data.frames' column names, and then one column per -#' data.frame (named after the input data.frame). The rows within the -#' data.frame-named columns are descriptions of the classes of the data within -#' the columns (generated by \code{compare_df_types_class_detect}). +#' mismatch (as in \code{base::rbind()}); with "bind_rows", columns missing +#' from a data.frame would be considered a match (as in +#' \code{dplyr::bind_rows()}). +#' @return A data.frame with a column named "column_name" with a value named +#' after the input data.frames' column names, and then one column per +#' data.frame (named after the input data.frame). If more than one input +#' would have the same column name, the column naming is from +#' \code{base::merge()} and may differ from expected naming. The rows within +#' the data.frame-named columns are descriptions of the classes of the data +#' within the columns (generated by \code{compare_df_types_class_detect}). 
#' @examples #' compare_df_types(data.frame(A=1), data.frame(B=2)) #' # user-defined names #' compare_df_types(dfA=data.frame(A=1), dfB=data.frame(B=2)) +#' # a combination of list and data.frame input +#' compare_df_types(listA=list(dfA=data.frame(A=1), dfB=data.frame(B=2)), data.frame(A=3)) #' @family Data frame type comparison #' @export compare_df_types <- function(..., return=c("all", "match", "mismatch"), bind_check=c("rbind", "bind_rows")) { + # Input checking return <- match.arg(return) bind_check <- match.arg(bind_check) - direct_names <- names(list(...)) + args <- list(...) + mask_input_data_frame <- sapply(X=args, FUN=is.data.frame) + mask_input_list <- sapply(X=args, FUN=is.list) & !mask_input_data_frame + mask_input_other <- !(mask_input_data_frame | mask_input_list) + if (any(mask_input_other)) { + stop("Input given with `...` must be either a data.frame or a list of data.frames.") + } + bad_list_inputs <- numeric(0) + for (idx in which(mask_input_list)) { + bad_list_inputs <- + c( + bad_list_inputs, + if (!all(sapply(X=args[[idx]], FUN=is.data.frame))) { + idx + } else { + numeric(0) + } + ) + } + if (length(bad_list_inputs)) { + stop( + "List inputs must be lists of data.frames. 
List input ", + if (length(bad_list_inputs) == 1) { + paste("number", bad_list_inputs, "is not a list of data.frames.") + } else if (length(bad_list_inputs) < 6) { + paste("numbers", paste(bad_list_inputs, collapse=", "), "are not lists of data.frames.") + } else { + paste("numbers", paste(c(bad_list_inputs[1:5], "..."), collapse=", "), "are not lists of data.frames.") + } + ) + } + + # Generate and check column names + direct_names <- names(args) indirect_names <- setdiff( as.character(match.call(expand.dots=TRUE)), @@ -41,20 +80,28 @@ compare_df_types <- function(..., return=c("all", "match", "mismatch"), bind_che mask_replace <- final_names %in% "" final_names[mask_replace] <- indirect_names[mask_replace] } - if (any(final_names %in% "column_name")) { - stop("None of the input ... arguments must be named `column_name`.") + final_names <- as.list(final_names) + for (idx in which(mask_input_list)) { + current_list_names <- names(args[[idx]]) + final_names[[idx]] <- + if (is.null(current_list_names)) { + paste(final_names[[idx]], seq_along(args[[idx]]), sep="_") + } else if (any(mask_unnamed_list <- current_list_names %in% "")) { + current_list_names[mask_unnamed_list] <- + paste( + final_names[[idx]][mask_unnamed_list], + seq_len(sum(mask_unnamed_list)), + sep="_" + ) + current_list_names + } else { + current_list_names + } } - args <- list(...) - ret <- compare_df_types_df_maker(args[[1]], class_colname=final_names[1]) - for (idx in (1+seq_len(length(args) - 1))) { - ret <- - merge( - ret, - compare_df_types_df_maker(args[[idx]], class_colname=final_names[idx]), - by="column_name", - all=TRUE - ) + if (any(unlist(final_names) %in% "column_name")) { + stop("None of the input ... 
argument names or list names may be `column_name`.") } + ret <- compare_df_types_df_maker(args, class_colname=final_names) if (return == "all" | ncol(ret) == 2) { if (return != "all") { warning("Only one data.frame provided, so all its classes are provided.") @@ -94,25 +141,49 @@ compare_df_types <- function(..., return=c("all", "match", "mismatch"), bind_che #' This is the workhorse for making a data.frame description used by #' compare_df_types -#' @param x The data.frame +#' @param x The data.frame or list of data.frames #' @param class_colname The name for the column-name-defining column #' @return A 2-column data.frame with the first column naming all the columns of #' \code{x} and the second column (named after the value in #' \code{class_colname}) defining the classes using #' \code{compare_df_types_class_detect()}. #' @noRd -compare_df_types_df_maker <- function(x, class_colname="class") { +compare_df_types_df_maker <- function(x, class_colname="class") + UseMethod("compare_df_types_df_maker") + +compare_df_types_df_maker.data.frame <- function(x, class_colname="class") { if (class_colname == "column_name") { stop('`class_colname` cannot be "column_name"') } + if (ncol(x) == 0) { + warning(class_colname, " has zero columns and will not appear in output.") + ret <- data.frame(column_name=character(0), stringsAsFactors=FALSE) + } else { + ret <- + data.frame( + column_name=names(x), + X=sapply(X=x, FUN=compare_df_types_class_detect), + stringsAsFactors=FALSE + ) + names(ret)[2] <- class_colname + } + ret +} + +compare_df_types_df_maker.list <- function(x, class_colname="class") { + if (length(class_colname) != length(x)) { + stop("`x` and `class_colname` must be the same length.") + } else if (any(class_colname == "column_name")) { + stop('`class_colname` cannot be "column_name"') + } ret <- - data.frame( - column_name=names(x), - X=sapply(X=x, FUN=compare_df_types_class_detect), - stringsAsFactors=FALSE + lapply( + X=seq_along(x), + FUN=function(idx) { + 
compare_df_types_df_maker(x=x[[idx]], class_colname=class_colname[[idx]]) + } ) - names(ret)[2] <- class_colname - ret + Reduce(f=function(x, y) {merge(x, y, by="column_name", all=TRUE)}, x=ret) } #' Will row binding succeed? diff --git a/man/compare_df_types.Rd b/man/compare_df_types.Rd index e6743f61..d786da36 100644 --- a/man/compare_df_types.Rd +++ b/man/compare_df_types.Rd @@ -9,25 +9,28 @@ compare_df_types(..., return = c("all", "match", "mismatch"), bind_check = c("rbind", "bind_rows")) } \arguments{ -\item{...}{data.frames or similar objects. The values may optionally be -named arguments; if named, the output column will be the name; if not -named, the output columne will be the data.frame name (see examples -section).} +\item{...}{data.frames or similar objects (such as tibbles), or lists of +data.frames (or similar objects). The values may optionally be named +arguments; if named, the output column will be the name; if not named, the +output columne will be the data.frame name (see examples section).} -\item{return}{Should a sumary of "all" columns be returned, only return -"match"ing columns, or only "mismatch"ing columns.} +\item{return}{Should a summary of "all" columns be returned, only return +"match"ing columns, or only "mismatch"ing columns?} -\item{bind_check}{What method of binding shoudl be used to determine matches? +\item{bind_check}{What method of binding should be used to determine matches? With "rbind", columns missing from a data.frame would be considered a -mismatch; with "bind_rows" columns missing from a data.frame would be -considered a match (as in \code{dplyr::bind_rows()}.} +mismatch (as in \code{base::rbind()}; with "bind_rows", columns missing +from a data.frame would be considered a match (as in +\code{dplyr::bind_rows()}.} } \value{ -A data.frame with a column named "column_name" and other columns - named after the input data.frames' column names, and then one column per - data.frame (named after the input data.frame). 
The rows within the - data.frame-named columns are descriptions of the classes of the data within - the columns (generated by \code{compare_df_types_class_detect}). +A data.frame with a column named "column_name" with a value named + after the input data.frames' column names, and then one column per + data.frame (named after the input data.frame). If more than one input + would have the same column name, the column naming is from + \code{base::merge()} and may differ from expected naming. The rows within + the data.frame-named columns are descriptions of the classes of the data + within the columns (generated by \code{compare_df_types_class_detect}). } \description{ Generate a comparison of data.frames (or similar objects) that indicates if @@ -41,6 +44,8 @@ Due to the returned "column_name" column, no input data.frame may be compare_df_types(data.frame(A=1), data.frame(B=2)) # user-defined names compare_df_types(dfA=data.frame(A=1), dfB=data.frame(B=2)) +# a combinatino of list and data.frame input +compare_df_types(listA=list(dfA=data.frame(A=1), dfB=data.frame(B=2)), data.frame(A=3)) } \seealso{ Other Data frame type comparison: \code{\link{compare_df_types_class_detect}}, diff --git a/man/compare_df_types_success.Rd b/man/compare_df_types_success.Rd index efb01cd9..a15a95a0 100644 --- a/man/compare_df_types_success.Rd +++ b/man/compare_df_types_success.Rd @@ -8,18 +8,19 @@ compare_df_types_success(..., return = "mismatch", bind_check = c("rbind", "bind_rows"), verbose = TRUE) } \arguments{ -\item{...}{data.frames or similar objects. The values may optionally be -named arguments; if named, the output column will be the name; if not -named, the output columne will be the data.frame name (see examples -section).} +\item{...}{data.frames or similar objects (such as tibbles), or lists of +data.frames (or similar objects). 
The values may optionally be named +arguments; if named, the output column will be the name; if not named, the +output columne will be the data.frame name (see examples section).} -\item{return}{Should a sumary of "all" columns be returned, only return -"match"ing columns, or only "mismatch"ing columns.} +\item{return}{Should a summary of "all" columns be returned, only return +"match"ing columns, or only "mismatch"ing columns?} -\item{bind_check}{What method of binding shoudl be used to determine matches? +\item{bind_check}{What method of binding should be used to determine matches? With "rbind", columns missing from a data.frame would be considered a -mismatch; with "bind_rows" columns missing from a data.frame would be -considered a match (as in \code{dplyr::bind_rows()}.} +mismatch (as in \code{base::rbind()}; with "bind_rows", columns missing +from a data.frame would be considered a match (as in +\code{dplyr::bind_rows()}.} \item{verbose}{Print the mismatching columns if binding will fail.} } diff --git a/tests/testthat/test-compare_df_types.R b/tests/testthat/test-compare_df_types.R index 58082ec0..e9c5d6b6 100644 --- a/tests/testthat/test-compare_df_types.R +++ b/tests/testthat/test-compare_df_types.R @@ -1,6 +1,5 @@ context("compare_df_types") - test_that("data.frame comparison works", { # Names are intentionally not pretty to make it easier to see the source. 
# These data.frames are not typically used, and if the input name is needed, @@ -116,6 +115,32 @@ test_that("data.frame comparison works", { ), info="bind_rows output skips NA" ) + + expect_warning( + expect_equal( + compare_df_types(data.frame()), + data.frame( + column_name=character(0), + stringsAsFactors=FALSE + ) + ), + regexp="data.frame() has zero columns and will not appear in output.", + fixed=TRUE, + info="empty data.frames by themselves work" + ) + expect_warning( + expect_equal( + compare_df_types(foo=data.frame(), bar=data.frame(A=1)), + data.frame( + column_name="A", + bar="numeric", + stringsAsFactors=FALSE + ) + ), + regexp="foo has zero columns and will not appear in output.", + fixed=TRUE, + info="empty data.frames with other inputs work" + ) }) test_that("class detection works", { @@ -145,3 +170,112 @@ test_that("boolean df comparison works", { expect_silent(expect_false(compare_df_types_success(data.frame(A=1), data.frame(B=2), verbose=FALSE))) expect_true(compare_df_types_success(data.frame(A=1), data.frame(B=2), bind_check="bind_rows")) }) + +test_that("list inputs to compare_df_types give appropriate errors", { + expect_error( + compare_df_types(list("A")), + regexp="List inputs must be lists of data.frames. List input number 1 is not a list of data.frames.", + fixed=TRUE + ) + expect_error( + compare_df_types(data.frame(), list("A")), + regexp="List inputs must be lists of data.frames. List input number 2 is not a list of data.frames.", + fixed=TRUE + ) + expect_error( + compare_df_types(list("A"), list("A")), + regexp="List inputs must be lists of data.frames. List input numbers 1, 2 are not lists of data.frames.", + fixed=TRUE + ) + expect_error( + compare_df_types(list("A"), list("A"), list("A"), list("A"), list("A"), list("A")), + regexp="List inputs must be lists of data.frames. List input numbers 1, 2, 3, 4, 5, ... 
are not lists of data.frames.", + fixed=TRUE + ) + expect_error( + compare_df_types(list(column_name=data.frame())), + regexp="None of the input ... argument names or list names may be `column_name`.", + fixed=TRUE + ) +}) + +test_that("list inputs to compare_df_types work as expected", { + expect_warning( + expect_equal( + compare_df_types( + list(foo=data.frame(), bar=data.frame(A=1, B=2)), + baz=data.frame(A=2, C=3) + ), + data.frame( + column_name=c("A", "B", "C"), + bar=c("numeric", "numeric", NA_character_), + baz=c("numeric", NA_character_, "numeric"), + stringsAsFactors=FALSE + ) + ), + regexp="foo has zero columns and will not appear in output.", + info="empty data.frame with other data.frames" + ) + expect_equal( + compare_df_types( + list(foo=data.frame(A=1), bar=data.frame(A=1, B=2)), + baz=data.frame(A=2, C=3) + ), + data.frame( + column_name=c("A", "B", "C"), + foo=c("numeric", NA_character_, NA_character_), + bar=c("numeric", "numeric", NA_character_), + baz=c("numeric", NA_character_, "numeric"), + stringsAsFactors=FALSE + ) + ) + # Naming complexity + expect_equal( + compare_df_types( + list(data.frame(A=1), bar=data.frame(A=1, B=2)), + baz=data.frame(A=2, C=3) + ), + setNames( + data.frame( + column_name=c("A", "B", "C"), + foo=c("numeric", NA_character_, NA_character_), + bar=c("numeric", "numeric", NA_character_), + baz=c("numeric", NA_character_, "numeric"), + stringsAsFactors=FALSE + ), + c("column_name", "list(data.frame(A = 1), bar = data.frame(A = 1, B = 2))_1", "bar", "baz") + ) + ) + expect_equal( + compare_df_types( + foo=list(data.frame(A=1), bar=data.frame(A=1, B=2)), + baz=data.frame(A=2, C=3) + ), + setNames( + data.frame( + column_name=c("A", "B", "C"), + foo=c("numeric", NA_character_, NA_character_), + bar=c("numeric", "numeric", NA_character_), + baz=c("numeric", NA_character_, "numeric"), + stringsAsFactors=FALSE + ), + c("column_name", "foo_1", "bar", "baz") + ) + ) + expect_equal( + compare_df_types( + 
foo=list(data.frame(A=1), data.frame(A=1, B=2)), + baz=data.frame(A=2, C=3) + ), + setNames( + data.frame( + column_name=c("A", "B", "C"), + foo=c("numeric", NA_character_, NA_character_), + bar=c("numeric", "numeric", NA_character_), + baz=c("numeric", NA_character_, "numeric"), + stringsAsFactors=FALSE + ), + c("column_name", "foo_1", "foo_2", "baz") + ) + ) +}) From fc3aa8279f14d84e22e5ff64de44c2aa17d4f13e Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Wed, 17 Apr 2019 16:21:00 -0400 Subject: [PATCH 3/6] Address code review comments --- NAMESPACE | 10 +- NEWS.md | 2 +- R/{compare_df_types.R => compare_df_cols.R} | 143 ++++++++++-------- man/compare_df_cols.Rd | 66 ++++++++ man/compare_df_cols_same.Rd | 44 ++++++ man/compare_df_types.Rd | 54 ------- man/compare_df_types_class_detect.Rd | 43 ------ man/compare_df_types_success.Rd | 44 ------ man/describe_class.Rd | 51 +++++++ ...pare_df_types.R => test-compare_df_cols.R} | 109 ++++++++----- 10 files changed, 323 insertions(+), 243 deletions(-) rename R/{compare_df_types.R => compare_df_cols.R} (54%) create mode 100644 man/compare_df_cols.Rd create mode 100644 man/compare_df_cols_same.Rd delete mode 100644 man/compare_df_types.Rd delete mode 100644 man/compare_df_types_class_detect.Rd delete mode 100644 man/compare_df_types_success.Rd create mode 100644 man/describe_class.Rd rename tests/testthat/{test-compare_df_types.R => test-compare_df_cols.R} (68%) diff --git a/NAMESPACE b/NAMESPACE index cf280c11..2afb6d0e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,10 +4,10 @@ S3method(chisq.test,default) S3method(chisq.test,tabyl) S3method(clean_names,default) S3method(clean_names,sf) -S3method(compare_df_types_class_detect,default) -S3method(compare_df_types_class_detect,factor) S3method(crosstab,data.frame) S3method(crosstab,default) +S3method(describe_class,default) +S3method(describe_class,factor) S3method(fisher.test,default) S3method(fisher.test,tabyl) S3method(print,tabyl) @@ -26,11 +26,11 @@ export(adorn_totals) 
export(as_tabyl) export(chisq.test) export(clean_names) -export(compare_df_types) -export(compare_df_types_class_detect) -export(compare_df_types_success) +export(compare_df_cols) +export(compare_df_cols_same) export(convert_to_NA) export(crosstab) +export(describe_class) export(excel_numeric_to_date) export(fisher.test) export(get_dupes) diff --git a/NEWS.md b/NEWS.md index 859153ec..ac2bb256 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,7 +10,7 @@ This new function can be supplied as a value for the `.name_repair` argument of Two new function `janitor::chisq.test()` and `janitor::fisher.test()` allow to apply their `stats` equivalent to two-way tabyl objects. -The new function `compare_df_types()` allows checking if data.frame (or similar object) row binding will succeed and reports on specific columns that will or will not succeed. A companion function `compare_df_types_success()` gives a TRUE/FALSE result (simplifying testing if the bind will work in code), and `compare_df_types_class_detect()` is a generic that allows overriding defaults if binding will work for specific classes (#50, thanks to **@billdenney** for the feature.) +The new function `compare_df_cols()` allows checking if a combination of data.frames, tibbles, or lists of data.frames/tibbles have columns with the same classes, and reports on specific columns that are or are not similar. A companion function `compare_df_cols_same()` gives a TRUE/FALSE result indicating if the columns are the same (and therefore bindable, though FALSE is not definitive that binding will fail), and `describe_class()` describes the class to make differences between data.frames clear at a glance (#50, thanks to **@billdenney** for the feature.) 
## Minor features diff --git a/R/compare_df_types.R b/R/compare_df_cols.R similarity index 54% rename from R/compare_df_types.R rename to R/compare_df_cols.R index 05bc5663..177bd9c9 100644 --- a/R/compare_df_types.R +++ b/R/compare_df_cols.R @@ -4,36 +4,47 @@ #' @details Due to the returned "column_name" column, no input data.frame may be #' named "column_name". #' -#' @param ... data.frames or similar objects (such as tibbles), or lists of -#' data.frames (or similar objects). The values may optionally be named -#' arguments; if named, the output column will be the name; if not named, the -#' output columne will be the data.frame name (see examples section). +#' The \code{strict_description} argument is most typically used to understand +#' if factor levels match or are bindable. Factors are typically bindable, +#' but the behavior of what happens when they bind differs based on the +#' binding method ("rbind" or "bind_rows"). Even when +#' \code{strict_description} is \code{FALSE}, data.frames may still bind +#' because some classes (like factors and characters) can bind even if they +#' appear to differ. +#' +#' @param ... A combination of data.frames, tibbles, and lists of +#' data.frames/tibbles. The values may optionally be named arguments; if +#' named, the output column will be the name; if not named, the output column +#' will be the data.frame name (see examples section). #' @param return Should a summary of "all" columns be returned, only return #' "match"ing columns, or only "mismatch"ing columns? -#' @param bind_check What method of binding should be used to determine matches? -#' With "rbind", columns missing from a data.frame would be considered a -#' mismatch (as in \code{base::rbind()}; with "bind_rows", columns missing -#' from a data.frame would be considered a match (as in +#' @param bind_method What method of binding should be used to determine +#' matches?
With "rbind", columns missing from a data.frame would be +#' considered a mismatch (as in \code{base::rbind()}; with "bind_rows", +#' columns missing from a data.frame would be considered a match (as in #' \code{dplyr::bind_rows()}. +#' @param strict_description Passed to \code{describe_class}. Also, see the +#' Details section. #' @return A data.frame with a column named "column_name" with a value named #' after the input data.frames' column names, and then one column per -#' data.frame (named after the input data.frame). If more than one input -#' would have the same column name, the column naming is from -#' \code{base::merge()} and may differ from expected naming. The rows within -#' the data.frame-named columns are descriptions of the classes of the data -#' within the columns (generated by \code{compare_df_types_class_detect}). +#' data.frame (named after the input data.frame). If more than one input has +#' the same column name, the column naming will have suffixes defined by +#' sequential use of \code{base::merge()} and may differ from expected naming. +#' The rows within the data.frame-named columns are descriptions of the +#' classes of the data within the columns (generated by +#' \code{describe_class}). 
#' @examples -#' compare_df_types(data.frame(A=1), data.frame(B=2)) +#' compare_df_cols(data.frame(A=1), data.frame(B=2)) #' # user-defined names -#' compare_df_types(dfA=data.frame(A=1), dfB=data.frame(B=2)) -#' # a combinatino of list and data.frame input -#' compare_df_types(listA=list(dfA=data.frame(A=1), dfB=data.frame(B=2)), data.frame(A=3)) +#' compare_df_cols(dfA=data.frame(A=1), dfB=data.frame(B=2)) +#' # a combination of list and data.frame input +#' compare_df_cols(listA=list(dfA=data.frame(A=1), dfB=data.frame(B=2)), data.frame(A=3)) #' @family Data frame type comparison #' @export -compare_df_types <- function(..., return=c("all", "match", "mismatch"), bind_check=c("rbind", "bind_rows")) { +compare_df_cols <- function(..., return=c("all", "match", "mismatch"), bind_method=c("rbind", "bind_rows"), strict_description=FALSE) { # Input checking return <- match.arg(return) - bind_check <- match.arg(bind_check) + bind_method <- match.arg(bind_method) args <- list(...) mask_input_data_frame <- sapply(X=args, FUN=is.data.frame) mask_input_list <- sapply(X=args, FUN=is.list) & !mask_input_data_frame @@ -68,11 +79,8 @@ compare_df_types <- function(..., return=c("all", "match", "mismatch"), bind_che # Generate and check column names direct_names <- names(args) - indirect_names <- - setdiff( - as.character(match.call(expand.dots=TRUE)), - as.character(match.call(expand.dots=FALSE)) - ) + indirect_names <- as.character(match.call(expand.dots=TRUE)) + indirect_names <- indirect_names[!(indirect_names %in% as.character(match.call(expand.dots=FALSE)))] if (is.null(direct_names)) { final_names <- indirect_names } else { @@ -101,7 +109,7 @@ compare_df_types <- function(..., return=c("all", "match", "mismatch"), bind_che if (any(unlist(final_names) %in% "column_name")) { stop("None of the input ... 
argument names or list names may be `column_name`.") } - ret <- compare_df_types_df_maker(args, class_colname=final_names) + ret <- compare_df_cols_df_maker(args, class_colname=final_names, strict_description=strict_description) if (return == "all" | ncol(ret) == 2) { if (return != "all") { warning("Only one data.frame provided, so all its classes are provided.") @@ -109,8 +117,8 @@ compare_df_types <- function(..., return=c("all", "match", "mismatch"), bind_che rownames(ret) <- NULL ret } else { - # Is this the best way to check for all row values to be equal? - bind_check_fun <- + # Choose which way to test if the rows are bindable (NA matches or not). + bind_method_fun <- list( rbind=function(idx) { all(unlist(ret[idx,3:ncol(ret)]) %in% ret[idx,2]) @@ -126,7 +134,7 @@ compare_df_types <- function(..., return=c("all", "match", "mismatch"), bind_che mask_match <- sapply( X=seq_len(nrow(ret)), - FUN=bind_check_fun[[bind_check]] + FUN=bind_method_fun[[bind_method]] ) ret <- if (return == "match") { @@ -140,18 +148,19 @@ compare_df_types <- function(..., return=c("all", "match", "mismatch"), bind_che } #' This is the workhorse for making a data.frame description used by -#' compare_df_types +#' compare_df_cols #' @param x The data.frame or list of data.frames #' @param class_colname The name for the column-name-defining column +#' @param strict_description Passed to \code{describe_class} #' @return A 2-column data.frame with the first column naming all the columns of #' \code{x} and the second column (named after the value in #' \code{class_colname}) defining the classes using -#' \code{compare_df_types_class_detect()}. +#' \code{describe_class()}. 
#' @noRd -compare_df_types_df_maker <- function(x, class_colname="class") - UseMethod("compare_df_types_df_maker") +compare_df_cols_df_maker <- function(x, class_colname="class", strict_description) + UseMethod("compare_df_cols_df_maker") -compare_df_types_df_maker.data.frame <- function(x, class_colname="class") { +compare_df_cols_df_maker.data.frame <- function(x, class_colname="class", strict_description) { if (class_colname == "column_name") { stop('`class_colname` cannot be "column_name"') } @@ -162,7 +171,7 @@ compare_df_types_df_maker.data.frame <- function(x, class_colname="class") { ret <- data.frame( column_name=names(x), - X=sapply(X=x, FUN=compare_df_types_class_detect), + X=sapply(X=x, FUN=describe_class, strict_description=strict_description), stringsAsFactors=FALSE ) names(ret)[2] <- class_colname @@ -170,7 +179,7 @@ compare_df_types_df_maker.data.frame <- function(x, class_colname="class") { ret } -compare_df_types_df_maker.list <- function(x, class_colname="class") { +compare_df_cols_df_maker.list <- function(x, class_colname="class", strict_description=strict_description) { if (length(class_colname) != length(x)) { stop("`x` and `class_colname` must be the same length.") } else if (any(class_colname == "column_name")) { @@ -180,28 +189,29 @@ compare_df_types_df_maker.list <- function(x, class_colname="class") { lapply( X=seq_along(x), FUN=function(idx) { - compare_df_types_df_maker(x=x[[idx]], class_colname=class_colname[[idx]]) + compare_df_cols_df_maker(x=x[[idx]], class_colname=class_colname[[idx]], strict_description=strict_description) } ) Reduce(f=function(x, y) {merge(x, y, by="column_name", all=TRUE)}, x=ret) } -#' Will row binding succeed? -#' @inheritParams compare_df_types +#' Are the data.frames the same? +#' +#' @inheritParams compare_df_cols #' @param verbose Print the mismatching columns if binding will fail. #' @return \code{TRUE} if row binding will succeed or \code{FALSE} if it will #' fail. 
#' @family Data frame type comparison #' @examples -#' compare_df_types_success(data.frame(A=1), data.frame(A=2)) -#' compare_df_types_success(data.frame(A=1), data.frame(B=2)) -#' compare_df_types_success(data.frame(A=1), data.frame(B=2), verbose=FALSE) -#' compare_df_types_success(data.frame(A=1), data.frame(B=2), bind_check="bind_rows") +#' compare_df_cols_same(data.frame(A=1), data.frame(A=2)) +#' compare_df_cols_same(data.frame(A=1), data.frame(B=2)) +#' compare_df_cols_same(data.frame(A=1), data.frame(B=2), verbose=FALSE) +#' compare_df_cols_same(data.frame(A=1), data.frame(B=2), bind_method="bind_rows") #' @export -compare_df_types_success <- function(..., return="mismatch", bind_check=c("rbind", "bind_rows"), verbose=TRUE) { +compare_df_cols_same <- function(..., return="mismatch", bind_method=c("rbind", "bind_rows"), verbose=TRUE) { return <- match.arg(return) - bind_check <- match.arg(bind_check) - ret <- compare_df_types(..., return=return, bind_check=bind_check) + bind_method <- match.arg(bind_method) + ret <- compare_df_cols(..., return=return, bind_method=bind_method) if (nrow(ret) & verbose) { print(ret) } @@ -210,36 +220,47 @@ compare_df_types_success <- function(..., return="mismatch", bind_check=c("rbind #' Describe the class(es) of an object #' -#' @details An S3 generic method can be written for -#' \code{compare_df_types_class_detect()} for other types that may need more -#' definition than the default method. +#' @details For package developers, an S3 generic method can be written for +#' \code{describe_class()} for custom classes that may need more definition +#' than the default method. #' #' @param x The object to describe +#' @param strict_description Should the description be strict? For factors, \code{TRUE} includes the levels and whether the factor is ordered, while \code{FALSE} reports only the class and omits "ordered". #' @return A character scalar describing the class(es) of an object where if the #' scalar will match, columns in a data.frame (or similar object) should bind #' together without issue.
#' @family Data frame type comparison +#' @examples +#' describe_class(1) +#' describe_class(factor("A")) +#' describe_class(ordered(c("A", "B"))) +#' describe_class(ordered(c("A", "B")), strict_description=FALSE) #' @export -compare_df_types_class_detect <- function(x) { - UseMethod("compare_df_types_class_detect") +describe_class <- function(x, strict_description=TRUE) { + UseMethod("describe_class") } -#' @describeIn compare_df_types_class_detect Describe factors with their levels +#' @describeIn describe_class Describe factors with their levels #' and if they are ordered. #' @export -compare_df_types_class_detect.factor <- function(x) { - all_classes <- class(x) - all_levels <- levels(x) - level_text <- sprintf("levels=c(%s)", paste('"', levels(x), '"', sep="", collapse=", ")) - factor_text <- sprintf("factor(%s)", level_text) - mask_factor <- class(x) == "factor" - all_classes[mask_factor] <- factor_text - paste(all_classes, collapse=", ") +describe_class.factor <- function(x, strict_description=TRUE) { + if (strict_description) { + all_classes <- class(x) + all_levels <- levels(x) + level_text <- sprintf("levels=c(%s)", paste('"', levels(x), '"', sep="", collapse=", ")) + factor_text <- sprintf("factor(%s)", level_text) + mask_factor <- class(x) == "factor" + all_classes[mask_factor] <- factor_text + paste(all_classes, collapse=", ") + } else { + all_classes <- setdiff(class(x), "ordered") + paste(all_classes, collapse=", ") + } } -#' @describeIn compare_df_types_class_detect List all classes of an object. +#' @describeIn describe_class List all classes of an object. 
#' @export -compare_df_types_class_detect.default <- function(x) { +describe_class.default <- function(x, strict_description=TRUE) { all_classes <- class(x) paste(all_classes, collapse=", ") } diff --git a/man/compare_df_cols.Rd b/man/compare_df_cols.Rd new file mode 100644 index 00000000..f81add5c --- /dev/null +++ b/man/compare_df_cols.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare_df_cols.R +\name{compare_df_cols} +\alias{compare_df_cols} +\title{Generate a comparison of data.frames (or similar objects) that indicates if +they will successfully bind together by rows.} +\usage{ +compare_df_cols(..., return = c("all", "match", "mismatch"), + bind_method = c("rbind", "bind_rows"), strict_description = FALSE) +} +\arguments{ +\item{...}{A combination of data.frames, tibbles, and lists of +data.frames/tibbles. The values may optionally be named arguments; if +named, the output column will be the name; if not named, the output columne +will be the data.frame name (see examples section).} + +\item{return}{Should a summary of "all" columns be returned, only return +"match"ing columns, or only "mismatch"ing columns?} + +\item{bind_method}{What method of binding should be used to determine +matches? With "rbind", columns missing from a data.frame would be +considered a mismatch (as in \code{base::rbind()}; with "bind_rows", +columns missing from a data.frame would be considered a match (as in +\code{dplyr::bind_rows()}.} + +\item{strict_description}{Passed to \code{describe_class}. Also, see the +Details section.} +} +\value{ +A data.frame with a column named "column_name" with a value named + after the input data.frames' column names, and then one column per + data.frame (named after the input data.frame). If more than one input has + the same column name, the column naming will have suffixes defined by + sequential use of \code{base::merge()} and may differ from expected naming. 
+ The rows within the data.frame-named columns are descriptions of the + classes of the data within the columns (generated by + \code{describe_class}). +} +\description{ +Generate a comparison of data.frames (or similar objects) that indicates if +they will successfully bind together by rows. +} +\details{ +Due to the returned "column_name" column, no input data.frame may be + named "column_name". + + The \code{strict_description} argument is most typically used to understand + if factor levels match or are bindable. Factors are typically bindable, + but the behavior of what happens when they bind differs based on the + binding method ("rbind" or "bind_rows"). Even when + \code{strict_description} is \code{FALSE}, data.frames may still bind + because some classes (like factors and characters) can bind even if they + appear to differ. +} +\examples{ +compare_df_cols(data.frame(A=1), data.frame(B=2)) +# user-defined names +compare_df_cols(dfA=data.frame(A=1), dfB=data.frame(B=2)) +# a combination of list and data.frame input +compare_df_cols(listA=list(dfA=data.frame(A=1), dfB=data.frame(B=2)), data.frame(A=3)) +} +\seealso{ +Other Data frame type comparison: \code{\link{compare_df_cols_same}}, + \code{\link{describe_class}} +} +\concept{Data frame type comparison} diff --git a/man/compare_df_cols_same.Rd b/man/compare_df_cols_same.Rd new file mode 100644 index 00000000..4003ca3d --- /dev/null +++ b/man/compare_df_cols_same.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare_df_cols.R +\name{compare_df_cols_same} +\alias{compare_df_cols_same} +\title{Will row binding succeed?} +\usage{ +compare_df_cols_same(..., return = "mismatch", bind_method = c("rbind", + "bind_rows"), verbose = TRUE) +} +\arguments{ +\item{...}{A combination of data.frames, tibbles, and lists of +data.frames/tibbles. 
The values may optionally be named arguments; if +named, the output column will be the name; if not named, the output columne +will be the data.frame name (see examples section).} + +\item{return}{Should a summary of "all" columns be returned, only return +"match"ing columns, or only "mismatch"ing columns?} + +\item{bind_method}{What method of binding should be used to determine +matches? With "rbind", columns missing from a data.frame would be +considered a mismatch (as in \code{base::rbind()}; with "bind_rows", +columns missing from a data.frame would be considered a match (as in +\code{dplyr::bind_rows()}.} + +\item{verbose}{Print the mismatching columns if binding will fail.} +} +\value{ +\code{TRUE} if row binding will succeed or \code{FALSE} if it will + fail. +} +\description{ +Will row binding succeed? +} +\examples{ +compare_df_cols_same(data.frame(A=1), data.frame(A=2)) +compare_df_cols_same(data.frame(A=1), data.frame(B=2)) +compare_df_cols_same(data.frame(A=1), data.frame(B=2), verbose=FALSE) +compare_df_cols_same(data.frame(A=1), data.frame(B=2), bind_method="bind_rows") +} +\seealso{ +Other Data frame type comparison: \code{\link{compare_df_cols}}, + \code{\link{describe_class}} +} +\concept{Data frame type comparison} diff --git a/man/compare_df_types.Rd b/man/compare_df_types.Rd deleted file mode 100644 index d786da36..00000000 --- a/man/compare_df_types.Rd +++ /dev/null @@ -1,54 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/compare_df_types.R -\name{compare_df_types} -\alias{compare_df_types} -\title{Generate a comparison of data.frames (or similar objects) that indicates if -they will successfully bind together by rows.} -\usage{ -compare_df_types(..., return = c("all", "match", "mismatch"), - bind_check = c("rbind", "bind_rows")) -} -\arguments{ -\item{...}{data.frames or similar objects (such as tibbles), or lists of -data.frames (or similar objects). 
The values may optionally be named -arguments; if named, the output column will be the name; if not named, the -output columne will be the data.frame name (see examples section).} - -\item{return}{Should a summary of "all" columns be returned, only return -"match"ing columns, or only "mismatch"ing columns?} - -\item{bind_check}{What method of binding should be used to determine matches? -With "rbind", columns missing from a data.frame would be considered a -mismatch (as in \code{base::rbind()}; with "bind_rows", columns missing -from a data.frame would be considered a match (as in -\code{dplyr::bind_rows()}.} -} -\value{ -A data.frame with a column named "column_name" with a value named - after the input data.frames' column names, and then one column per - data.frame (named after the input data.frame). If more than one input - would have the same column name, the column naming is from - \code{base::merge()} and may differ from expected naming. The rows within - the data.frame-named columns are descriptions of the classes of the data - within the columns (generated by \code{compare_df_types_class_detect}). -} -\description{ -Generate a comparison of data.frames (or similar objects) that indicates if -they will successfully bind together by rows. -} -\details{ -Due to the returned "column_name" column, no input data.frame may be - named "column_name". 
-} -\examples{ -compare_df_types(data.frame(A=1), data.frame(B=2)) -# user-defined names -compare_df_types(dfA=data.frame(A=1), dfB=data.frame(B=2)) -# a combinatino of list and data.frame input -compare_df_types(listA=list(dfA=data.frame(A=1), dfB=data.frame(B=2)), data.frame(A=3)) -} -\seealso{ -Other Data frame type comparison: \code{\link{compare_df_types_class_detect}}, - \code{\link{compare_df_types_success}} -} -\concept{Data frame type comparison} diff --git a/man/compare_df_types_class_detect.Rd b/man/compare_df_types_class_detect.Rd deleted file mode 100644 index fd0d0d6b..00000000 --- a/man/compare_df_types_class_detect.Rd +++ /dev/null @@ -1,43 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/compare_df_types.R -\name{compare_df_types_class_detect} -\alias{compare_df_types_class_detect} -\alias{compare_df_types_class_detect.factor} -\alias{compare_df_types_class_detect.default} -\title{Describe the class(es) of an object} -\usage{ -compare_df_types_class_detect(x) - -\method{compare_df_types_class_detect}{factor}(x) - -\method{compare_df_types_class_detect}{default}(x) -} -\arguments{ -\item{x}{The object to describe} -} -\value{ -A character scalar describing the class(es) of an object where if the - scalar will match, columns in a data.frame (or similar object) should bind - together without issue. -} -\description{ -Describe the class(es) of an object -} -\details{ -An S3 generic method can be written for - \code{compare_df_types_class_detect()} for other types that may need more - definition than the default method. -} -\section{Methods (by class)}{ -\itemize{ -\item \code{factor}: Describe factors with their levels -and if they are ordered. - -\item \code{default}: List all classes of an object. 
-}} - -\seealso{ -Other Data frame type comparison: \code{\link{compare_df_types_success}}, - \code{\link{compare_df_types}} -} -\concept{Data frame type comparison} diff --git a/man/compare_df_types_success.Rd b/man/compare_df_types_success.Rd deleted file mode 100644 index a15a95a0..00000000 --- a/man/compare_df_types_success.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/compare_df_types.R -\name{compare_df_types_success} -\alias{compare_df_types_success} -\title{Will row binding succeed?} -\usage{ -compare_df_types_success(..., return = "mismatch", - bind_check = c("rbind", "bind_rows"), verbose = TRUE) -} -\arguments{ -\item{...}{data.frames or similar objects (such as tibbles), or lists of -data.frames (or similar objects). The values may optionally be named -arguments; if named, the output column will be the name; if not named, the -output columne will be the data.frame name (see examples section).} - -\item{return}{Should a summary of "all" columns be returned, only return -"match"ing columns, or only "mismatch"ing columns?} - -\item{bind_check}{What method of binding should be used to determine matches? -With "rbind", columns missing from a data.frame would be considered a -mismatch (as in \code{base::rbind()}; with "bind_rows", columns missing -from a data.frame would be considered a match (as in -\code{dplyr::bind_rows()}.} - -\item{verbose}{Print the mismatching columns if binding will fail.} -} -\value{ -\code{TRUE} if row binding will succeed or \code{FALSE} if it will - fail. -} -\description{ -Will row binding succeed? 
-} -\examples{ -compare_df_types_success(data.frame(A=1), data.frame(A=2)) -compare_df_types_success(data.frame(A=1), data.frame(B=2)) -compare_df_types_success(data.frame(A=1), data.frame(B=2), verbose=FALSE) -compare_df_types_success(data.frame(A=1), data.frame(B=2), bind_check="bind_rows") -} -\seealso{ -Other Data frame type comparison: \code{\link{compare_df_types_class_detect}}, - \code{\link{compare_df_types}} -} -\concept{Data frame type comparison} diff --git a/man/describe_class.Rd b/man/describe_class.Rd new file mode 100644 index 00000000..ff027bfc --- /dev/null +++ b/man/describe_class.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare_df_cols.R +\name{describe_class} +\alias{describe_class} +\alias{describe_class.factor} +\alias{describe_class.default} +\title{Describe the class(es) of an object} +\usage{ +describe_class(x, strict_description = TRUE) + +\method{describe_class}{factor}(x, strict_description = TRUE) + +\method{describe_class}{default}(x, strict_description = TRUE) +} +\arguments{ +\item{x}{The object to describe} + +\item{strict_description}{Should the} +} +\value{ +A character scalar describing the class(es) of an object where if the + scalar will match, columns in a data.frame (or similar object) should bind + together without issue. +} +\description{ +Describe the class(es) of an object +} +\details{ +For package developers, an S3 generic method can be written for + \code{describe_class()} for custom classes that may need more definition + than the default method. +} +\section{Methods (by class)}{ +\itemize{ +\item \code{factor}: Describe factors with their levels +and if they are ordered. + +\item \code{default}: List all classes of an object. 
+}} + +\examples{ +describe_class(1) +describe_class(factor("A")) +describe_class(ordered(c("A", "B"))) +describe_class(ordered(c("A", "B")), strict_description=FALSE) +} +\seealso{ +Other Data frame type comparison: \code{\link{compare_df_cols_same}}, + \code{\link{compare_df_cols}} +} +\concept{Data frame type comparison} diff --git a/tests/testthat/test-compare_df_types.R b/tests/testthat/test-compare_df_cols.R similarity index 68% rename from tests/testthat/test-compare_df_types.R rename to tests/testthat/test-compare_df_cols.R index e9c5d6b6..46592ead 100644 --- a/tests/testthat/test-compare_df_types.R +++ b/tests/testthat/test-compare_df_cols.R @@ -1,11 +1,11 @@ -context("compare_df_types") +context("compare_df_cols") test_that("data.frame comparison works", { # Names are intentionally not pretty to make it easier to see the source. # These data.frames are not typically used, and if the input name is needed, # it can be a named argument. expect_equal( - compare_df_types(data.frame(A=1), data.frame(B=2)), + compare_df_cols(data.frame(A=1), data.frame(B=2)), setNames( data.frame( column_name=c("A", "B"), @@ -20,7 +20,7 @@ test_that("data.frame comparison works", { info="Names are detected from unnamed input" ) expect_equal( - compare_df_types(foo=data.frame(A=1), bar=data.frame(B=2)), + compare_df_cols(foo=data.frame(A=1), bar=data.frame(B=2)), data.frame( column_name=c("A", "B"), foo=c("numeric", NA), @@ -30,7 +30,7 @@ test_that("data.frame comparison works", { info="Names can be used from the input" ) expect_equal( - compare_df_types(foo=data.frame(A=1), data.frame(B=2)), + compare_df_cols(foo=data.frame(A=1), data.frame(B=2)), setNames( data.frame( column_name=c("A", "B"), @@ -45,7 +45,7 @@ test_that("data.frame comparison works", { info="Names are detected from unnamed input and can be mixed with named arguments" ) expect_equal( - compare_df_types(foo=data.frame(A=1, B=1), bar=data.frame(B=2)), + compare_df_cols(foo=data.frame(A=1, B=1), 
bar=data.frame(B=2)), data.frame( column_name=c("A", "B"), foo="numeric", @@ -55,7 +55,7 @@ test_that("data.frame comparison works", { info="all output comes through when requested" ) expect_equal( - compare_df_types(foo=data.frame(A=1, B=1), bar=data.frame(B=2), return="match"), + compare_df_cols(foo=data.frame(A=1, B=1), bar=data.frame(B=2), return="match"), data.frame( column_name="B", foo="numeric", @@ -65,7 +65,7 @@ test_that("data.frame comparison works", { info="only matching output comes through when requested" ) expect_equal( - compare_df_types(foo=data.frame(A=1, B=1), bar=data.frame(B=2), return="mismatch"), + compare_df_cols(foo=data.frame(A=1, B=1), bar=data.frame(B=2), return="mismatch"), data.frame( column_name="A", foo="numeric", @@ -76,7 +76,7 @@ test_that("data.frame comparison works", { ) expect_warning( expect_equal( - compare_df_types(foo=data.frame(A=1, B=1), return="mismatch"), + compare_df_cols(foo=data.frame(A=1, B=1), return="mismatch"), data.frame( column_name=c("A", "B"), foo="numeric", @@ -87,11 +87,26 @@ test_that("data.frame comparison works", { info="A single data.frame isn't very meaningful, so the user is warned that they probably didn't do what they meant to do." 
) expect_equal( - compare_df_types( + compare_df_cols( foo=data.frame(A=1, B=1, C=factor("A"), D=factor("B")), bar=data.frame(B=2, C=factor("A"), D=factor(c("A", "B"))), return="mismatch" ), + data.frame( + column_name="A", + foo="numeric", + bar=NA_character_, + stringsAsFactors=FALSE + ), + info="only mismatching output comes through when requested (and it works with something a bit more complex than numeric)" + ) + expect_equal( + compare_df_cols( + foo=data.frame(A=1, B=1, C=factor("A"), D=factor("B")), + bar=data.frame(B=2, C=factor("A"), D=factor(c("A", "B"))), + return="mismatch", + strict_description=TRUE + ), data.frame( column_name=c("A", "D"), foo=c("numeric", 'factor(levels=c("B"))'), @@ -101,11 +116,28 @@ test_that("data.frame comparison works", { info="only mismatching output comes through when requested (and it works with something a bit more complex than numeric)" ) expect_equal( - compare_df_types( + compare_df_cols( foo=data.frame(A=1, B=1, C=factor("A"), D=factor("B")), bar=data.frame(B=2, C=factor("A"), D=factor(c("A", "B"))), return="mismatch", - bind_check="bind_rows" + bind_method="bind_rows", + strict_description=FALSE + ), + data.frame( + column_name="D", + foo='factor', + bar='factor', + stringsAsFactors=FALSE + )[-1,], + info="bind_rows output skips NA" + ) + expect_equal( + compare_df_cols( + foo=data.frame(A=1, B=1, C=factor("A"), D=factor("B")), + bar=data.frame(B=2, C=factor("A"), D=factor(c("A", "B"))), + return="mismatch", + bind_method="bind_rows", + strict_description=TRUE ), data.frame( column_name="D", @@ -118,7 +150,7 @@ test_that("data.frame comparison works", { expect_warning( expect_equal( - compare_df_types(data.frame()), + compare_df_cols(data.frame()), data.frame( column_name=character(0), stringsAsFactors=FALSE @@ -130,7 +162,7 @@ test_that("data.frame comparison works", { ) expect_warning( expect_equal( - compare_df_types(foo=data.frame(), bar=data.frame(A=1)), + compare_df_cols(foo=data.frame(), bar=data.frame(A=1)), 
data.frame( column_name="A", bar="numeric", @@ -144,65 +176,72 @@ test_that("data.frame comparison works", { }) test_that("class detection works", { - expect_equal(compare_df_types_class_detect(5), "numeric") - expect_equal(compare_df_types_class_detect("A"), "character") + expect_equal(describe_class(5), "numeric") + expect_equal(describe_class("A"), "character") expect_equal( - compare_df_types_class_detect(as.POSIXct("2019-01-02")), "POSIXct, POSIXt", + describe_class(as.POSIXct("2019-01-02")), "POSIXct, POSIXt", info="multiple classes work" ) expect_equal( - compare_df_types_class_detect(factor("A")), + describe_class(factor("A")), 'factor(levels=c("A"))' ) expect_equal( - compare_df_types_class_detect(factor("A", ordered=TRUE)), + describe_class(factor("A", ordered=TRUE)), 'ordered, factor(levels=c("A"))' ) expect_equal( - compare_df_types_class_detect(factor(c("A", "B"), ordered=TRUE)), + describe_class(factor(c("A", "B"), ordered=TRUE)), 'ordered, factor(levels=c("A", "B"))' ) }) +test_that("class description without strict description", { + # No change with numeric + expect_equal(describe_class(5, strict_description=FALSE), "numeric") + # ordered factors don't show "ordered" or the levels + expect_equal(describe_class(ordered(c("A", "B")), strict_description=FALSE), "factor") +}) + test_that("boolean df comparison works", { - expect_true(compare_df_types_success(data.frame(A=1), data.frame(A=2))) - expect_output(expect_false(compare_df_types_success(data.frame(A=1), data.frame(B=2)))) - expect_silent(expect_false(compare_df_types_success(data.frame(A=1), data.frame(B=2), verbose=FALSE))) - expect_true(compare_df_types_success(data.frame(A=1), data.frame(B=2), bind_check="bind_rows")) + expect_true(compare_df_cols_same(data.frame(A=1), data.frame(A=2))) + expect_output(expect_false(compare_df_cols_same(data.frame(A=1), data.frame(B=2)))) + expect_silent(expect_false(compare_df_cols_same(data.frame(A=1), data.frame(B=2), verbose=FALSE))) + 
expect_true(compare_df_cols_same(data.frame(A=1), data.frame(B=2), bind_method="bind_rows")) }) -test_that("list inputs to compare_df_types give appropriate errors", { +test_that("list inputs to compare_df_cols give appropriate errors", { expect_error( - compare_df_types(list("A")), + compare_df_cols(list("A")), regexp="List inputs must be lists of data.frames. List input number 1 is not a list of data.frames.", fixed=TRUE ) expect_error( - compare_df_types(data.frame(), list("A")), + compare_df_cols(data.frame(), list("A")), regexp="List inputs must be lists of data.frames. List input number 2 is not a list of data.frames.", fixed=TRUE ) expect_error( - compare_df_types(list("A"), list("A")), + compare_df_cols(list("A"), list("A")), regexp="List inputs must be lists of data.frames. List input numbers 1, 2 are not lists of data.frames.", fixed=TRUE ) expect_error( - compare_df_types(list("A"), list("A"), list("A"), list("A"), list("A"), list("A")), + compare_df_cols(list("A"), list("A"), list("A"), list("A"), list("A"), list("A")), regexp="List inputs must be lists of data.frames. List input numbers 1, 2, 3, 4, 5, ... are not lists of data.frames.", fixed=TRUE ) expect_error( - compare_df_types(list(column_name=data.frame())), + compare_df_cols(list(column_name=data.frame())), regexp="None of the input ... 
argument names or list names may be `column_name`.", fixed=TRUE ) }) -test_that("list inputs to compare_df_types work as expected", { +test_that("list inputs to compare_df_cols work as expected", { expect_warning( expect_equal( - compare_df_types( + compare_df_cols( list(foo=data.frame(), bar=data.frame(A=1, B=2)), baz=data.frame(A=2, C=3) ), @@ -217,7 +256,7 @@ test_that("list inputs to compare_df_types work as expected", { info="empty data.frame with other data.frames" ) expect_equal( - compare_df_types( + compare_df_cols( list(foo=data.frame(A=1), bar=data.frame(A=1, B=2)), baz=data.frame(A=2, C=3) ), @@ -231,7 +270,7 @@ test_that("list inputs to compare_df_types work as expected", { ) # Naming complexity expect_equal( - compare_df_types( + compare_df_cols( list(data.frame(A=1), bar=data.frame(A=1, B=2)), baz=data.frame(A=2, C=3) ), @@ -247,7 +286,7 @@ test_that("list inputs to compare_df_types work as expected", { ) ) expect_equal( - compare_df_types( + compare_df_cols( foo=list(data.frame(A=1), bar=data.frame(A=1, B=2)), baz=data.frame(A=2, C=3) ), @@ -263,7 +302,7 @@ test_that("list inputs to compare_df_types work as expected", { ) ) expect_equal( - compare_df_types( + compare_df_cols( foo=list(data.frame(A=1), data.frame(A=1, B=2)), baz=data.frame(A=2, C=3) ), From a18bbd7f5e5e747e9eaba8ca4360804bc1762d73 Mon Sep 17 00:00:00 2001 From: Sam Firke Date: Thu, 18 Apr 2019 14:48:44 -0400 Subject: [PATCH 4/6] typo columne -> column --- R/compare_df_cols.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compare_df_cols.R b/R/compare_df_cols.R index 177bd9c9..360892a7 100644 --- a/R/compare_df_cols.R +++ b/R/compare_df_cols.R @@ -14,7 +14,7 @@ #' #' @param ... A combination of data.frames, tibbles, and lists of #' data.frames/tibbles. 
The values may optionally be named arguments; if -#' named, the output column will be the name; if not named, the output columne +#' named, the output column will be the name; if not named, the output column #' will be the data.frame name (see examples section). #' @param return Should a summary of "all" columns be returned, only return #' "match"ing columns, or only "mismatch"ing columns? From cf7a2ff89288c5cbd6cd10876db201c3e895aca1 Mon Sep 17 00:00:00 2001 From: Sam Firke Date: Fri, 19 Apr 2019 16:59:56 -0400 Subject: [PATCH 5/6] make bind_rows the default value of bind_method swapping it in for rbind, since this is a tidyverse-aligned package and my quick poll shows more peers using dplyr::bind_rows --- R/compare_df_cols.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/compare_df_cols.R b/R/compare_df_cols.R index 360892a7..f91065e5 100644 --- a/R/compare_df_cols.R +++ b/R/compare_df_cols.R @@ -7,7 +7,7 @@ #' The \code{strict_description} argument is most typically used to understand #' if factor levels match or are bindable. Factors are typically bindable, #' but the behavior of what happens when they bind differs based on the -#' binding method ("rbind" or "bind_rows"). Even when +#' binding method ("bind_rows" or "rbind"). Even when #' \code{strict_description} is \code{FALSE}, data.frames may still bind #' because some classes (like factors and characters) can bind even if they #' appear to differ. @@ -19,10 +19,10 @@ #' @param return Should a summary of "all" columns be returned, only return #' "match"ing columns, or only "mismatch"ing columns? #' @param bind_method What method of binding should be used to determine -#' matches? With "rbind", columns missing from a data.frame would be -#' considered a mismatch (as in \code{base::rbind()}; with "bind_rows", -#' columns missing from a data.frame would be considered a match (as in -#' \code{dplyr::bind_rows()}. +#' matches? 
With "bind_rows", columns missing from a data.frame would be +#' considered a match (as in \code{dplyr::bind_rows()}); with "rbind", columns +#' missing from a data.frame would be considered a mismatch (as in +#' \code{base::rbind()}). #' @param strict_description Passed to \code{describe_class}. Also, see the #' Details section. #' @return A data.frame with a column named "column_name" with a value named @@ -41,7 +41,7 @@ #' compare_df_cols(listA=list(dfA=data.frame(A=1), dfB=data.frame(B=2)), data.frame(A=3)) #' @family Data frame type comparison #' @export -compare_df_cols <- function(..., return=c("all", "match", "mismatch"), bind_method=c("rbind", "bind_rows"), strict_description=FALSE) { +compare_df_cols <- function(..., return=c("all", "match", "mismatch"), bind_method=c("bind_rows", "rbind"), strict_description=FALSE) { # Input checking return <- match.arg(return) bind_method <- match.arg(bind_method) @@ -206,9 +206,9 @@ compare_df_cols_df_maker.list <- function(x, class_colname="class", strict_descr #' compare_df_cols_same(data.frame(A=1), data.frame(A=2)) #' compare_df_cols_same(data.frame(A=1), data.frame(B=2)) #' compare_df_cols_same(data.frame(A=1), data.frame(B=2), verbose=FALSE) -#' compare_df_cols_same(data.frame(A=1), data.frame(B=2), bind_method="bind_rows") +#' compare_df_cols_same(data.frame(A=1), data.frame(B=2), bind_method="rbind") #' @export -compare_df_cols_same <- function(..., return="mismatch", bind_method=c("rbind", "bind_rows"), verbose=TRUE) { +compare_df_cols_same <- function(..., return="mismatch", bind_method=c("bind_rows", "rbind"), verbose=TRUE) { return <- match.arg(return) bind_method <- match.arg(bind_method) ret <- compare_df_cols(..., return=return, bind_method=bind_method) From f2bcd968cd4ee21110bfb6ac3814e149e05067be Mon Sep 17 00:00:00 2001 From: Sam Firke Date: Fri, 19 Apr 2019 17:08:40 -0400 Subject: [PATCH 6/6] re-describe new functions --- NEWS.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/NEWS.md b/NEWS.md index ac2bb256..86bc3db8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,13 +4,13 @@ The new function `make_clean_names()` takes a character vector and returns the cleaned text, with the same functionality as the existing `clean_names()`, which runs on a data.frame, manipulating its names. (#197, thanks **@tazinho** and everyone who contributed to the discussion). -This new function can be supplied as a value for the `.name_repair` argument of `as_tibble()` in the `tibble` package. For example: `as_tibble(iris, .name_repair = make_clean_names)`. +This function can be supplied as a value for the `.name_repair` argument of `as_tibble()` in the `tibble` package. For example: `as_tibble(iris, .name_repair = make_clean_names)`. `remove_empty()` now has a companion function `remove_constant()` which removes columns have a single value, optionally ignoring `NA` (#222, thanks to **@billdenney** for suggesting & implementing). -Two new function `janitor::chisq.test()` and `janitor::fisher.test()` allow to apply their `stats` equivalent to two-way tabyl objects. +Added the functions `janitor::chisq.test()` and `janitor::fisher.test()` to enable running these statistical tests from the base `stats` package on two-way `tabyl` objects. While the package loading message says the base functions are masked, the base tests still run on `table` objects. -The new function `compare_df_cols()` allows checking if a combination of data.frames, tibbles, or lists of data.frames/tibbles have columns with the same classes, and reports on specific columns that are or are not similar. A companion function `compare_df_cols_same()` gives a TRUE/FALSE result indicating if the columns are the same (and therefore bindable, though FALSE is not definitive that binding will fail), and `describe_class()` describes the class to make differences between data.frames clear at a glance (#50, thanks to **@billdenney** for the feature.) 
+The new function `compare_df_cols()` compares the names and classes of columns in a set of supplied data.frames or tibbles, reporting on the specific columns that are or are not similar. This is for the common use case where a set of data files should all have the same specifications but, in practice, may not. A companion function `compare_df_cols_same()` gives a `TRUE/FALSE` result indicating if the columns are the same (and therefore bindable, though FALSE is not definitive that binding will fail). The helper function `describe_class()` describes a variable's class to make differences between data.frames clear at a glance - it is used by developers in extending the `compare_df` functions to custom classes (#50, thanks to **@billdenney** for the feature.) ## Minor features