-
Notifications
You must be signed in to change notification settings - Fork 130
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
* Allow data.frame row-binding comparison (Fix #50) * Allow list inputs to `compare_df_types()` * Address code review comments * typo columne -> column * make bind_rows the default value of bind_method swapping it in for rbind, since this is a tidyverse-aligned package and my quick poll shows more peers using dplyr::bind_rows * re-describe new functions
- Loading branch information
1 parent
f44b71e
commit 51aa5df
Showing
7 changed files
with
756 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,266 @@ | ||
#' Generate a comparison of data.frames (or similar objects) that indicates if | ||
#' they will successfully bind together by rows. | ||
#' | ||
#' @details Due to the returned "column_name" column, no input data.frame may be | ||
#' named "column_name". | ||
#' | ||
#' The \code{strict_description} argument is most typically used to understand | ||
#' if factor levels match or are bindable. Factors are typically bindable, | ||
#' but the behavior of what happens when they bind differs based on the | ||
#' binding method ("bind_rows" or "rbind"). Even when | ||
#' \code{strict_description} is \code{FALSE}, data.frames may still bind | ||
#' because some classes (like factors and characters) can bind even if they | ||
#' appear to differ. | ||
#' | ||
#' @param ... A combination of data.frames, tibbles, and lists of | ||
#' data.frames/tibbles. The values may optionally be named arguments; if | ||
#' named, the output column will be the name; if not named, the output column | ||
#' will be the data.frame name (see examples section). | ||
#' @param return Should a summary of "all" columns be returned, only return | ||
#' "match"ing columns, or only "mismatch"ing columns? | ||
#' @param bind_method What method of binding should be used to determine | ||
#' matches? With "bind_rows", columns missing from a data.frame would be | ||
#' considered a match (as in \code{dplyr::bind_rows()}; with "rbind", columns | ||
#' missing from a data.frame would be considered a mismatch (as in | ||
#' \code{base::rbind()}. | ||
#' @param strict_description Passed to \code{describe_class}. Also, see the | ||
#' Details section. | ||
#' @return A data.frame with a column named "column_name" with a value named | ||
#' after the input data.frames' column names, and then one column per | ||
#' data.frame (named after the input data.frame). If more than one input has | ||
#' the same column name, the column naming will have suffixes defined by | ||
#' sequential use of \code{base::merge()} and may differ from expected naming. | ||
#' The rows within the data.frame-named columns are descriptions of the | ||
#' classes of the data within the columns (generated by | ||
#' \code{describe_class}). | ||
#' @examples | ||
#' compare_df_cols(data.frame(A=1), data.frame(B=2)) | ||
#' # user-defined names | ||
#' compare_df_cols(dfA=data.frame(A=1), dfB=data.frame(B=2)) | ||
#' # a combination of list and data.frame input | ||
#' compare_df_cols(listA=list(dfA=data.frame(A=1), dfB=data.frame(B=2)), data.frame(A=3)) | ||
#' @family Data frame type comparison | ||
#' @export | ||
compare_df_cols <- function(..., return=c("all", "match", "mismatch"), bind_method=c("bind_rows", "rbind"), strict_description=FALSE) { | ||
# Input checking | ||
return <- match.arg(return) | ||
bind_method <- match.arg(bind_method) | ||
args <- list(...) | ||
mask_input_data_frame <- sapply(X=args, FUN=is.data.frame) | ||
mask_input_list <- sapply(X=args, FUN=is.list) & !mask_input_data_frame | ||
mask_input_other <- !(mask_input_data_frame | mask_input_list) | ||
if (any(mask_input_other)) { | ||
stop("Input given with `...` must be either a data.frame or a list of data.frames.") | ||
} | ||
bad_list_inputs <- numeric(0) | ||
for (idx in which(mask_input_list)) { | ||
bad_list_inputs <- | ||
c( | ||
bad_list_inputs, | ||
if (!all(sapply(X=args[[idx]], FUN=is.data.frame))) { | ||
idx | ||
} else { | ||
numeric(0) | ||
} | ||
) | ||
} | ||
if (length(bad_list_inputs)) { | ||
stop( | ||
"List inputs must be lists of data.frames. List input ", | ||
if (length(bad_list_inputs) == 1) { | ||
paste("number", bad_list_inputs, "is not a list of data.frames.") | ||
} else if (length(bad_list_inputs) < 6) { | ||
paste("numbers", paste(bad_list_inputs, collapse=", "), "are not lists of data.frames.") | ||
} else { | ||
paste("numbers", paste(c(bad_list_inputs[1:5], "..."), collapse=", "), "are not lists of data.frames.") | ||
} | ||
) | ||
} | ||
|
||
# Generate and check column names | ||
direct_names <- names(args) | ||
indirect_names <- as.character(match.call(expand.dots=TRUE)) | ||
indirect_names <- indirect_names[!(indirect_names %in% as.character(match.call(expand.dots=FALSE)))] | ||
if (is.null(direct_names)) { | ||
final_names <- indirect_names | ||
} else { | ||
final_names <- direct_names | ||
mask_replace <- final_names %in% "" | ||
final_names[mask_replace] <- indirect_names[mask_replace] | ||
} | ||
final_names <- as.list(final_names) | ||
for (idx in which(mask_input_list)) { | ||
current_list_names <- names(args[[idx]]) | ||
final_names[[idx]] <- | ||
if (is.null(current_list_names)) { | ||
paste(final_names[[idx]], seq_along(args[[idx]]), sep="_") | ||
} else if (any(mask_unnamed_list <- current_list_names %in% "")) { | ||
current_list_names[mask_unnamed_list] <- | ||
paste( | ||
final_names[[idx]][mask_unnamed_list], | ||
seq_len(sum(mask_unnamed_list)), | ||
sep="_" | ||
) | ||
current_list_names | ||
} else { | ||
current_list_names | ||
} | ||
} | ||
if (any(unlist(final_names) %in% "column_name")) { | ||
stop("None of the input ... argument names or list names may be `column_name`.") | ||
} | ||
ret <- compare_df_cols_df_maker(args, class_colname=final_names, strict_description=strict_description) | ||
if (return == "all" | ncol(ret) == 2) { | ||
if (return != "all") { | ||
warning("Only one data.frame provided, so all its classes are provided.") | ||
} | ||
rownames(ret) <- NULL | ||
ret | ||
} else { | ||
# Choose which way to test if the rows are bindable (NA matches or not). | ||
bind_method_fun <- | ||
list( | ||
rbind=function(idx) { | ||
all(unlist(ret[idx,3:ncol(ret)]) %in% ret[idx,2]) | ||
}, | ||
bind_rows=function(idx) { | ||
all( | ||
unlist(ret[idx,3:ncol(ret)]) %in% | ||
c(NA_character_, | ||
na.omit(unlist(ret[idx,2:ncol(ret)]))[1]) | ||
) | ||
} | ||
) | ||
mask_match <- | ||
sapply( | ||
X=seq_len(nrow(ret)), | ||
FUN=bind_method_fun[[bind_method]] | ||
) | ||
ret <- | ||
if (return == "match") { | ||
ret[mask_match,] | ||
} else if (return == "mismatch") { | ||
ret[!mask_match,] | ||
} | ||
rownames(ret) <- NULL | ||
ret | ||
} | ||
} | ||
|
||
#' This is the workhorse for making a data.frame description used by | ||
#' compare_df_cols | ||
#' @param x The data.frame or list of data.frames | ||
#' @param class_colname The name for the column-name-defining column | ||
#' @param strict_description Passed to \code{describe_class} | ||
#' @return A 2-column data.frame with the first column naming all the columns of | ||
#' \code{x} and the second column (named after the value in | ||
#' \code{class_colname}) defining the classes using | ||
#' \code{describe_class()}. | ||
#' @noRd | ||
compare_df_cols_df_maker <- function(x, class_colname="class", strict_description) | ||
UseMethod("compare_df_cols_df_maker") | ||
|
||
compare_df_cols_df_maker.data.frame <- function(x, class_colname="class", strict_description) { | ||
if (class_colname == "column_name") { | ||
stop('`class_colname` cannot be "column_name"') | ||
} | ||
if (ncol(x) == 0) { | ||
warning(class_colname, " has zero columns and will not appear in output.") | ||
ret <- data.frame(column_name=character(0), stringsAsFactors=FALSE) | ||
} else { | ||
ret <- | ||
data.frame( | ||
column_name=names(x), | ||
X=sapply(X=x, FUN=describe_class, strict_description=strict_description), | ||
stringsAsFactors=FALSE | ||
) | ||
names(ret)[2] <- class_colname | ||
} | ||
ret | ||
} | ||
|
||
compare_df_cols_df_maker.list <- function(x, class_colname="class", strict_description=strict_description) { | ||
if (length(class_colname) != length(x)) { | ||
stop("`x` and `class_colname` must be the same length.") | ||
} else if (any(class_colname == "column_name")) { | ||
stop('`class_colname` cannot be "column_name"') | ||
} | ||
ret <- | ||
lapply( | ||
X=seq_along(x), | ||
FUN=function(idx) { | ||
compare_df_cols_df_maker(x=x[[idx]], class_colname=class_colname[[idx]], strict_description=strict_description) | ||
} | ||
) | ||
Reduce(f=function(x, y) {merge(x, y, by="column_name", all=TRUE)}, x=ret) | ||
} | ||
|
||
#' Are the data.frames the same? | ||
#' | ||
#' @inheritParams compare_df_cols | ||
#' @param verbose Print the mismatching columns if binding will fail. | ||
#' @return \code{TRUE} if row binding will succeed or \code{FALSE} if it will | ||
#' fail. | ||
#' @family Data frame type comparison | ||
#' @examples | ||
#' compare_df_cols_same(data.frame(A=1), data.frame(A=2)) | ||
#' compare_df_cols_same(data.frame(A=1), data.frame(B=2)) | ||
#' compare_df_cols_same(data.frame(A=1), data.frame(B=2), verbose=FALSE) | ||
#' compare_df_cols_same(data.frame(A=1), data.frame(B=2), bind_method="rbind") | ||
#' @export | ||
compare_df_cols_same <- function(..., return="mismatch", bind_method=c("bind_rows", "rbind"), verbose=TRUE) { | ||
return <- match.arg(return) | ||
bind_method <- match.arg(bind_method) | ||
ret <- compare_df_cols(..., return=return, bind_method=bind_method) | ||
if (nrow(ret) & verbose) { | ||
print(ret) | ||
} | ||
nrow(ret) == 0 | ||
} | ||
|
||
#' Describe the class(es) of an object | ||
#' | ||
#' @details For package developers, an S3 generic method can be written for | ||
#' \code{describe_class()} for custom classes that may need more definition | ||
#' than the default method. | ||
#' | ||
#' @param x The object to describe | ||
#' @param strict_description Should the | ||
#' @return A character scalar describing the class(es) of an object where if the | ||
#' scalar will match, columns in a data.frame (or similar object) should bind | ||
#' together without issue. | ||
#' @family Data frame type comparison | ||
#' @examples | ||
#' describe_class(1) | ||
#' describe_class(factor("A")) | ||
#' describe_class(ordered(c("A", "B"))) | ||
#' describe_class(ordered(c("A", "B")), strict_description=FALSE) | ||
#' @export | ||
describe_class <- function(x, strict_description=TRUE) { | ||
UseMethod("describe_class") | ||
} | ||
|
||
#' @describeIn describe_class Describe factors with their levels | ||
#' and if they are ordered. | ||
#' @export | ||
describe_class.factor <- function(x, strict_description=TRUE) { | ||
if (strict_description) { | ||
all_classes <- class(x) | ||
all_levels <- levels(x) | ||
level_text <- sprintf("levels=c(%s)", paste('"', levels(x), '"', sep="", collapse=", ")) | ||
factor_text <- sprintf("factor(%s)", level_text) | ||
mask_factor <- class(x) == "factor" | ||
all_classes[mask_factor] <- factor_text | ||
paste(all_classes, collapse=", ") | ||
} else { | ||
all_classes <- setdiff(class(x), "ordered") | ||
paste(all_classes, collapse=", ") | ||
} | ||
} | ||
|
||
#' @describeIn describe_class List all classes of an object. | ||
#' @export | ||
describe_class.default <- function(x, strict_description=TRUE) { | ||
all_classes <- class(x) | ||
paste(all_classes, collapse=", ") | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.