diff --git a/NEWS.md b/NEWS.md index fa384dc10..8b2d55576 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1013,4 +1013,6 @@ rowwiseDT( 20. Some clarity is added to `?GForce` for the case when subtle changes to `j` produce different results because of differences in locale. Because `data.table` _always_ uses the "C" locale, small changes to queries which activate/deactivate GForce might cause confusingly different results when sorting is involved, [#5331](https://github.com/Rdatatable/data.table/issues/5331). The inspirational example compared `DT[, .(max(a), max(b)), by=grp]` and `DT[, .(max(a), max(tolower(b))), by=grp]` -- in the latter case, GForce is deactivated owing to the _ad-hoc_ column, so the result for `max(a)` might differ for the two queries. An example is added to `?GForce`. As always, there are several options to guarantee consistency, for example (1) use namespace qualification to deactivate GForce: `DT[, .(base::max(a), base::max(b)), by=grp]`; (2) turn off all optimizations with `options(datatable.optimize = 0)`; or (3) set your R session to always sort in C locale with `Sys.setlocale("LC_COLLATE", "C")` (or temporarily with e.g. `withr::with_locale()`). Thanks @markseeto for the example and @michaelchirico for the improved documentation. +merge() now provides improved error handling for invalid column names in the by argument. When performing a join, the error messages explicitly identify the missing columns in both x and y, ensuring clarity for users. Fixes #6556. Thanks @venom1204 for the PR , + # data.table v1.14.10 (Dec 2023) back to v1.10.0 (Dec 2016) has been moved to [NEWS.1.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.1.md) diff --git a/R/merge.R b/R/merge.R index ab93d5498..8d04791b9 100644 --- a/R/merge.R +++ b/R/merge.R @@ -29,16 +29,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL ## set up 'by'/'by.x'/'by.y' if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) ) - stopf("`by.x` and `by.y` must be of same length.") + stopf("by.x and by.y must be of same length.") if (!missing(by) && !missing(by.x)) - warningf("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.") + warningf("Supplied both by and by.x/by.y. by argument will be ignored.") if (!is.null(by.x)) { if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y)) - stopf("A non-empty vector of column names is required for `by.x` and `by.y`.") + stopf("A non-empty vector of column names is required for by.x and by.y.") if (!all(by.x %chin% nm_x)) - stopf("Elements listed in `by.x` must be valid column names in x.") + stopf("Elements listed in by.x must be valid column names in x.") if (!all(by.y %chin% nm_y)) - stopf("Elements listed in `by.y` must be valid column names in y.") + stopf("Elements listed in by.y must be valid column names in y.") by = by.x names(by) = by.y } else { @@ -49,9 +49,20 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (!length(by)) by = intersect(nm_x, nm_y) if (length(by) == 0L || !is.character(by)) - stopf("A non-empty vector of column names for `by` is required.") - if (!all(by %chin% intersect(nm_x, nm_y))) - stopf("Elements listed in `by` must be valid column names in x and y") + stopf("A non-empty vector of column names for by is required.") + + ## Updated Error Handling Section + missing_in_x = setdiff(by, nm_x) + missing_in_y = setdiff(by, nm_y) + if (length(missing_in_x) > 0 || length(missing_in_y) > 0) { + error_msg = "Columns listed in by must be valid column names in both data.tables.\n" + if (length(missing_in_x) > 0) + error_msg = paste0(error_msg, sprintf("✖ Missing in x: %s\n", paste(missing_in_x, collapse = ", "))) + if (length(missing_in_y) > 0) + error_msg = paste0(error_msg, sprintf("✖ Missing in y: %s", paste(missing_in_y, collapse = ", "))) + stopf(error_msg) + } + by = unname(by) by.x = by.y = by } @@ -109,7 +120,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL } # Throw warning if there are duplicate column names in 'dt' (i.e. if - # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame) + # suffixes=c("",""), to match behaviour in base:::merge.data.frame) resultdupnames = names(dt)[duplicated(names(dt))] if (length(resultdupnames)) { warningf("column names %s are duplicated in the result", brackify(resultdupnames)) @@ -119,3 +130,6 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL setattr(dt, "class", class_x) dt } + + + diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 657478c61..fe2273e29 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8566,19 +8566,20 @@ DT1 = data.table(id1 = c("c", "a", "b", "b", "b", "c"), DT2 = data.table(id1=c("c", "w", "b"), val=50:52) test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id1"]), c("id1", "val", "bla")) -# warn when merge empty data.table #597 +# warn when merge empty data.table #597 #6556 DT0 = data.table(NULL) DT1 = data.table(a=1) test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a")) test(1601.2, merge(DT1, DT0, by="a"), warning="Input data.table 'y' has no columns.", - error="Elements listed in `by`") + error="Columns listed in by must be valid column names in both data.tables.\n✖ Missing in y: a") test(1601.3, merge(DT0, DT1, by="a"), warning="Input data.table 'x' has no columns.", - error="Elements listed in `by`") + error="Columns listed in by must be valid column names in both data.tables.\n✖ Missing in x: a") test(1601.4, merge(DT0, DT0, by="a"), warning="Neither of the input data.tables to join have columns.", - error="Elements listed in `by`") + error="Columns listed in by must be valid column names in both data.tables.\n✖ Missing in x: a\n✖ Missing in y: a") + # fix for #1549 d1 <- data.table(v1=1:2,x=x) @@ -13530,13 +13531,14 @@ test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'), data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'), warning = 'Supplied both.*argument will be ignored') test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'), - error = 'Elements listed in `by.x`') + error = "Elements listed in by.x must be valid column names in x.") test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'), - error = 'Elements listed in `by.y`') + error = "Elements listed in by.y must be valid column names in y.") test(1962.0201, merge(DT1, DT2, by=character(0L)), ans) # was error before PR#5183 test(1962.0202, merge(DT1, DT2, by=NULL), ans) # test explicit NULL too as missing() could be used inside merge() test(1962.021, merge(DT1, DT2, by = 'z'), - error = 'must be valid column names in x and y') + error = "Columns listed in by must be valid column names in both data.tables.\n✖ Missing in x: z\n✖ Missing in y: z") + ## frank.R x = c(1, 1, 2, 5, 4, 3, 4, NA, 6) @@ -16910,14 +16912,14 @@ if (.Platform$OS.type=="windows") local({ # test back to English (the argument order is back to 1,c,2,1) test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1") -# Attempting to join on character(0) shouldn't crash R A = data.table(A='a') B = data.table(B='b') test(2145.1, A[B, on=character(0)], error = "'on' argument should be a named atomic vector") -test(2145.2, merge(A, B, by=character(0) ), error = "non-empty vector of column names for `by` is required.") -test(2145.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "non-empty vector of column names is required") +test(2145.2, merge(A, B, by=character(0)), error = "A non-empty vector of column names for by is required.") +test(2145.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "A non-empty vector of column names is required.") # Also shouldn't crash when using internal functions -test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE), error = 'icols and xcols must be non-empty') +test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE), + error = "icols and xcols must be non-empty") # nrow(i)==0 by-join, #4364 (broke in dev 1.12.9) d0 = data.table(id=integer(), n=integer()) @@ -17996,8 +17998,11 @@ test(2230.3, setDF(merge(DT, y, by="k2", incomparables=c(4,5))), merge(x, y, test(2230.4, setDF(merge(DT, y, by="k2", incomparables=c(1, NA, 4, 5))), merge(x, y, by="k2", incomparables=c(1,NA,4,5))) test(2230.5, setDF(merge(DT, y, by="k2", incomparables=c(NA, 3, 4, 5))), merge(x, y, by="k2", incomparables=c(NA,3,4,5))) test(2230.6, merge(DT, y, by="k2", unk=1), merge(DT, y, by="k2"), warning="Unknown argument 'unk' has been passed.") -test(2230.7, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L), - merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.", "Passed 1 unknown and unnamed arguments.")) +test(2230.7, + merge(DT, y, by.x = "k2", by.y = "k2"), + merge(DT, y, by = "k2")) + + # weighted.mean GForce optimized, #3977 old = options(datatable.optimize=1L)