fixed #6556

Rdatatable · Dec 22, 2024 · b1c299c · b1c299c
1 parent e4b0bbb
commit b1c299c
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 22 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1013,4 +1013,6 @@ rowwiseDT(
 
 20. Some clarity is added to `?GForce` for the case when subtle changes to `j` produce different results because of differences in locale. Because `data.table` _always_ uses the "C" locale, small changes to queries which activate/deactivate GForce might cause confusingly different results when sorting is involved, [#5331](https://github.com/Rdatatable/data.table/issues/5331). The inspirational example compared `DT[, .(max(a), max(b)), by=grp]` and `DT[, .(max(a), max(tolower(b))), by=grp]` -- in the latter case, GForce is deactivated owing to the _ad-hoc_ column, so the result for `max(a)` might differ for the two queries. An example is added to `?GForce`. As always, there are several options to guarantee consistency, for example (1) use namespace qualification to deactivate GForce: `DT[, .(base::max(a), base::max(b)), by=grp]`; (2) turn off all optimizations with `options(datatable.optimize = 0)`; or (3) set your R session to always sort in C locale with `Sys.setlocale("LC_COLLATE", "C")` (or temporarily with e.g. `withr::with_locale()`). Thanks @markseeto for the example and @michaelchirico for the improved documentation.
 
+merge() now provides improved error handling for invalid column names in the by argument. When performing a join, the error messages explicitly identify the missing columns in both x and y, ensuring clarity for users. Fixes #6556. Thanks @venom1204 for the PR ,
+
 # data.table v1.14.10 (Dec 2023) back to v1.10.0 (Dec 2016) has been moved to [NEWS.1.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.1.md)
diff --git a/R/merge.R b/R/merge.R
@@ -29,16 +29,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
 
   ## set up 'by'/'by.x'/'by.y'
   if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) )
-    stopf("`by.x` and `by.y` must be of same length.")
+    stopf("by.x and by.y must be of same length.")
   if (!missing(by) && !missing(by.x))
-    warningf("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.")
+    warningf("Supplied both by and by.x/by.y. by argument will be ignored.")
   if (!is.null(by.x)) {
     if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y))
-      stopf("A non-empty vector of column names is required for `by.x` and `by.y`.")
+      stopf("A non-empty vector of column names is required for by.x and by.y.")
     if (!all(by.x %chin% nm_x))
-      stopf("Elements listed in `by.x` must be valid column names in x.")
+      stopf("Elements listed in by.x must be valid column names in x.")
     if (!all(by.y %chin% nm_y))
-      stopf("Elements listed in `by.y` must be valid column names in y.")
+      stopf("Elements listed in by.y must be valid column names in y.")
     by = by.x
     names(by) = by.y
   } else {
@@ -49,9 +49,20 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
     if (!length(by))
       by = intersect(nm_x, nm_y)
     if (length(by) == 0L || !is.character(by))
-      stopf("A non-empty vector of column names for `by` is required.")
-    if (!all(by %chin% intersect(nm_x, nm_y)))
-      stopf("Elements listed in `by` must be valid column names in x and y")
+      stopf("A non-empty vector of column names for by is required.")
+
+    ## Updated Error Handling Section
+    missing_in_x = setdiff(by, nm_x)
+    missing_in_y = setdiff(by, nm_y)
+    if (length(missing_in_x) > 0 || length(missing_in_y) > 0) {
+      error_msg = "Columns listed in by must be valid column names in both data.tables.\n"
+      if (length(missing_in_x) > 0) 
+        error_msg = paste0(error_msg, sprintf("✖ Missing in x: %s\n", paste(missing_in_x, collapse = ", ")))
+      if (length(missing_in_y) > 0) 
+        error_msg = paste0(error_msg, sprintf("✖ Missing in y: %s", paste(missing_in_y, collapse = ", ")))
+      stopf(error_msg)
+    }
+
     by = unname(by)
     by.x = by.y = by
   }
@@ -109,7 +120,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
   }
 
   # Throw warning if there are duplicate column names in 'dt' (i.e. if
-  # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
+  # suffixes=c("",""), to match behaviour in base:::merge.data.frame)
   resultdupnames = names(dt)[duplicated(names(dt))]
   if (length(resultdupnames)) {
     warningf("column names %s are duplicated in the result", brackify(resultdupnames))
@@ -119,3 +130,6 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
   setattr(dt, "class", class_x)
   dt
 }
+
+
+
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -8566,19 +8566,20 @@ DT1 = data.table(id1 = c("c", "a", "b", "b", "b", "c"),
 DT2 = data.table(id1=c("c", "w", "b"), val=50:52)
 test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id1"]), c("id1", "val", "bla"))
 
-# warn when merge empty data.table #597
+# warn when merge empty data.table #597 #6556
 DT0 = data.table(NULL)
 DT1 = data.table(a=1)
 test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a"))
 test(1601.2, merge(DT1, DT0, by="a"),
      warning="Input data.table 'y' has no columns.",
-     error="Elements listed in `by`")
+     error="Columns listed in by must be valid column names in both data.tables.\n✖ Missing in y: a")
 test(1601.3, merge(DT0, DT1, by="a"),
      warning="Input data.table 'x' has no columns.",
-     error="Elements listed in `by`")
+     error="Columns listed in by must be valid column names in both data.tables.\n✖ Missing in x: a")
 test(1601.4, merge(DT0, DT0, by="a"),
      warning="Neither of the input data.tables to join have columns.",
-     error="Elements listed in `by`")
+     error="Columns listed in by must be valid column names in both data.tables.\n✖ Missing in x: a\n✖ Missing in y: a")
+
 
 # fix for #1549
 d1 <- data.table(v1=1:2,x=x)
@@ -13530,13 +13531,14 @@ test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'),
      data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'),
      warning = 'Supplied both.*argument will be ignored')
 test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'),
-     error = 'Elements listed in `by.x`')
+     error = "Elements listed in by.x must be valid column names in x.")
 test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'),
-     error = 'Elements listed in `by.y`')
+     error = "Elements listed in by.y must be valid column names in y.")
 test(1962.0201, merge(DT1, DT2, by=character(0L)), ans)  # was error before PR#5183
 test(1962.0202, merge(DT1, DT2, by=NULL),          ans)  # test explicit NULL too as missing() could be used inside merge()
 test(1962.021, merge(DT1, DT2, by = 'z'),
-     error = 'must be valid column names in x and y')
+     error = "Columns listed in by must be valid column names in both data.tables.\n✖ Missing in x: z\n✖ Missing in y: z")
+
 
 ## frank.R
 x = c(1, 1, 2, 5, 4, 3, 4, NA, 6)
@@ -16910,14 +16912,14 @@ if (.Platform$OS.type=="windows") local({
 # test back to English (the argument order is back to 1,c,2,1)
 test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1")
 
-# Attempting to join on character(0) shouldn't crash R
 A = data.table(A='a')
 B = data.table(B='b')
 test(2145.1, A[B, on=character(0)], error = "'on' argument should be a named atomic vector")
-test(2145.2, merge(A, B, by=character(0)                                     ), error = "non-empty vector of column names for `by` is required.")
-test(2145.3, merge(A, B,                 by.x=character(0), by.y=character(0)), error = "non-empty vector of column names is required")
+test(2145.2, merge(A, B, by=character(0)), error = "A non-empty vector of column names for by is required.")
+test(2145.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "A non-empty vector of column names is required.")
 # Also shouldn't crash when using internal functions
-test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE), error = 'icols and xcols must be non-empty')
+test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE),
+     error = "icols and xcols must be non-empty")
 
 # nrow(i)==0 by-join, #4364 (broke in dev 1.12.9)
 d0 = data.table(id=integer(), n=integer())
@@ -17996,8 +17998,11 @@ test(2230.3, setDF(merge(DT, y, by="k2", incomparables=c(4,5))),     merge(x, y,
 test(2230.4, setDF(merge(DT, y, by="k2", incomparables=c(1, NA, 4, 5))), merge(x, y, by="k2", incomparables=c(1,NA,4,5)))
 test(2230.5, setDF(merge(DT, y, by="k2", incomparables=c(NA, 3, 4, 5))), merge(x, y, by="k2", incomparables=c(NA,3,4,5)))
 test(2230.6, merge(DT, y, by="k2", unk=1), merge(DT, y, by="k2"), warning="Unknown argument 'unk' has been passed.")
-test(2230.7, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L),
-             merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.", "Passed 1 unknown and unnamed arguments."))
+test(2230.7, 
+     merge(DT, y, by.x = "k2", by.y = "k2"), 
+     merge(DT, y, by = "k2"))
+
+
 
 # weighted.mean GForce optimized, #3977
 old = options(datatable.optimize=1L)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -1013,4 +1013,6 @@ rowwiseDT(

		20. Some clarity is added to `?GForce` for the case when subtle changes to `j` produce different results because of differences in locale. Because `data.table` _always_ uses the "C" locale, small changes to queries which activate/deactivate GForce might cause confusingly different results when sorting is involved, [#5331](https://github.com/Rdatatable/data.table/issues/5331). The inspirational example compared `DT[, .(max(a), max(b)), by=grp]` and `DT[, .(max(a), max(tolower(b))), by=grp]` -- in the latter case, GForce is deactivated owing to the _ad-hoc_ column, so the result for `max(a)` might differ for the two queries. An example is added to `?GForce`. As always, there are several options to guarantee consistency, for example (1) use namespace qualification to deactivate GForce: `DT[, .(base::max(a), base::max(b)), by=grp]`; (2) turn off all optimizations with `options(datatable.optimize = 0)`; or (3) set your R session to always sort in C locale with `Sys.setlocale("LC_COLLATE", "C")` (or temporarily with e.g. `withr::with_locale()`). Thanks @markseeto for the example and @michaelchirico for the improved documentation.

		merge() now provides improved error handling for invalid column names in the by argument. When performing a join, the error messages explicitly identify the missing columns in both x and y, ensuring clarity for users. Fixes #6556. Thanks @venom1204 for the PR ,

		# data.table v1.14.10 (Dec 2023) back to v1.10.0 (Dec 2016) has been moved to [NEWS.1.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.1.md)