Skip to content

Commit

Permalink
fixed #6556
Browse files Browse the repository at this point in the history
  • Loading branch information
venom1204 committed Dec 22, 2024
1 parent e4b0bbb commit b1c299c
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 22 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -1013,4 +1013,6 @@ rowwiseDT(

20. Some clarity is added to `?GForce` for the case when subtle changes to `j` produce different results because of differences in locale. Because `data.table` _always_ uses the "C" locale, small changes to queries which activate/deactivate GForce might cause confusingly different results when sorting is involved, [#5331](https://github.com/Rdatatable/data.table/issues/5331). The inspirational example compared `DT[, .(max(a), max(b)), by=grp]` and `DT[, .(max(a), max(tolower(b))), by=grp]` -- in the latter case, GForce is deactivated owing to the _ad-hoc_ column, so the result for `max(a)` might differ for the two queries. An example is added to `?GForce`. As always, there are several options to guarantee consistency, for example (1) use namespace qualification to deactivate GForce: `DT[, .(base::max(a), base::max(b)), by=grp]`; (2) turn off all optimizations with `options(datatable.optimize = 0)`; or (3) set your R session to always sort in C locale with `Sys.setlocale("LC_COLLATE", "C")` (or temporarily with e.g. `withr::with_locale()`). Thanks @markseeto for the example and @michaelchirico for the improved documentation.

merge() now provides improved error handling for invalid column names in the by argument. When performing a join, the error messages explicitly identify the missing columns in both x and y, ensuring clarity for users. Fixes #6556. Thanks @venom1204 for the PR ,

# data.table v1.14.10 (Dec 2023) back to v1.10.0 (Dec 2016) has been moved to [NEWS.1.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.1.md)
32 changes: 23 additions & 9 deletions R/merge.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL

## set up 'by'/'by.x'/'by.y'
if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) )
stopf("`by.x` and `by.y` must be of same length.")
stopf("by.x and by.y must be of same length.")
if (!missing(by) && !missing(by.x))
warningf("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.")
warningf("Supplied both by and by.x/by.y. by argument will be ignored.")
if (!is.null(by.x)) {
if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y))
stopf("A non-empty vector of column names is required for `by.x` and `by.y`.")
stopf("A non-empty vector of column names is required for by.x and by.y.")
if (!all(by.x %chin% nm_x))
stopf("Elements listed in `by.x` must be valid column names in x.")
stopf("Elements listed in by.x must be valid column names in x.")
if (!all(by.y %chin% nm_y))
stopf("Elements listed in `by.y` must be valid column names in y.")
stopf("Elements listed in by.y must be valid column names in y.")
by = by.x
names(by) = by.y
} else {
Expand All @@ -49,9 +49,20 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
if (!length(by))
by = intersect(nm_x, nm_y)
if (length(by) == 0L || !is.character(by))
stopf("A non-empty vector of column names for `by` is required.")
if (!all(by %chin% intersect(nm_x, nm_y)))
stopf("Elements listed in `by` must be valid column names in x and y")
stopf("A non-empty vector of column names for by is required.")

## Updated Error Handling Section
missing_in_x = setdiff(by, nm_x)
missing_in_y = setdiff(by, nm_y)
if (length(missing_in_x) > 0 || length(missing_in_y) > 0) {
error_msg = "Columns listed in by must be valid column names in both data.tables.\n"
if (length(missing_in_x) > 0)

Check warning on line 59 in R/merge.R

View workflow job for this annotation

GitHub Actions / lint-r

file=R/merge.R,line=59,col=36,[trailing_whitespace_linter] Remove trailing whitespace.
error_msg = paste0(error_msg, sprintf("✖ Missing in x: %s\n", paste(missing_in_x, collapse = ", ")))

Check warning on line 60 in R/merge.R

View workflow job for this annotation

GitHub Actions / lint-r

file=R/merge.R,line=60,col=71,[paste_linter] toString(.) is more expressive than paste(., collapse = ", "). Note also glue::glue_collapse() and and::and() for constructing human-readable / translation-friendly lists
if (length(missing_in_y) > 0)

Check warning on line 61 in R/merge.R

View workflow job for this annotation

GitHub Actions / lint-r

file=R/merge.R,line=61,col=36,[trailing_whitespace_linter] Remove trailing whitespace.
error_msg = paste0(error_msg, sprintf("✖ Missing in y: %s", paste(missing_in_y, collapse = ", ")))

Check warning on line 62 in R/merge.R

View workflow job for this annotation

GitHub Actions / lint-r

file=R/merge.R,line=62,col=69,[paste_linter] toString(.) is more expressive than paste(., collapse = ", "). Note also glue::glue_collapse() and and::and() for constructing human-readable / translation-friendly lists
stopf(error_msg)
}

by = unname(by)
by.x = by.y = by
}
Expand Down Expand Up @@ -109,7 +120,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
}

# Throw warning if there are duplicate column names in 'dt' (i.e. if
# `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
# suffixes=c("",""), to match behaviour in base:::merge.data.frame)
resultdupnames = names(dt)[duplicated(names(dt))]
if (length(resultdupnames)) {
warningf("column names %s are duplicated in the result", brackify(resultdupnames))
Expand All @@ -119,3 +130,6 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
setattr(dt, "class", class_x)
dt
}

Check warning on line 133 in R/merge.R

View workflow job for this annotation

GitHub Actions / lint-r

file=R/merge.R,line=133,col=1,[trailing_blank_lines_linter] Remove trailing blank lines.

Check warning on line 134 in R/merge.R

View workflow job for this annotation

GitHub Actions / lint-r

file=R/merge.R,line=134,col=1,[trailing_blank_lines_linter] Remove trailing blank lines.

Check warning on line 135 in R/merge.R

View workflow job for this annotation

GitHub Actions / lint-r

file=R/merge.R,line=135,col=1,[trailing_blank_lines_linter] Remove trailing blank lines.
31 changes: 18 additions & 13 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -8566,19 +8566,20 @@ DT1 = data.table(id1 = c("c", "a", "b", "b", "b", "c"),
DT2 = data.table(id1=c("c", "w", "b"), val=50:52)
test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id1"]), c("id1", "val", "bla"))

# warn when merge empty data.table #597
# warn when merge empty data.table #597 #6556
DT0 = data.table(NULL)
DT1 = data.table(a=1)
test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a"))
test(1601.2, merge(DT1, DT0, by="a"),
warning="Input data.table 'y' has no columns.",
error="Elements listed in `by`")
error="Columns listed in by must be valid column names in both data.tables.\n✖ Missing in y: a")
test(1601.3, merge(DT0, DT1, by="a"),
warning="Input data.table 'x' has no columns.",
error="Elements listed in `by`")
error="Columns listed in by must be valid column names in both data.tables.\n✖ Missing in x: a")
test(1601.4, merge(DT0, DT0, by="a"),
warning="Neither of the input data.tables to join have columns.",
error="Elements listed in `by`")
error="Columns listed in by must be valid column names in both data.tables.\n✖ Missing in x: a\n✖ Missing in y: a")


# fix for #1549
d1 <- data.table(v1=1:2,x=x)
Expand Down Expand Up @@ -13530,13 +13531,14 @@ test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'),
data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'),
warning = 'Supplied both.*argument will be ignored')
test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'),
error = 'Elements listed in `by.x`')
error = "Elements listed in by.x must be valid column names in x.")
test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'),
error = 'Elements listed in `by.y`')
error = "Elements listed in by.y must be valid column names in y.")
test(1962.0201, merge(DT1, DT2, by=character(0L)), ans) # was error before PR#5183
test(1962.0202, merge(DT1, DT2, by=NULL), ans) # test explicit NULL too as missing() could be used inside merge()
test(1962.021, merge(DT1, DT2, by = 'z'),
error = 'must be valid column names in x and y')
error = "Columns listed in by must be valid column names in both data.tables.\n✖ Missing in x: z\n✖ Missing in y: z")


## frank.R
x = c(1, 1, 2, 5, 4, 3, 4, NA, 6)
Expand Down Expand Up @@ -16910,14 +16912,14 @@ if (.Platform$OS.type=="windows") local({
# test back to English (the argument order is back to 1,c,2,1)
test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1")

# Attempting to join on character(0) shouldn't crash R
A = data.table(A='a')
B = data.table(B='b')
test(2145.1, A[B, on=character(0)], error = "'on' argument should be a named atomic vector")
test(2145.2, merge(A, B, by=character(0) ), error = "non-empty vector of column names for `by` is required.")
test(2145.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "non-empty vector of column names is required")
test(2145.2, merge(A, B, by=character(0)), error = "A non-empty vector of column names for by is required.")
test(2145.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "A non-empty vector of column names is required.")
# Also shouldn't crash when using internal functions
test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE), error = 'icols and xcols must be non-empty')
test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE),
error = "icols and xcols must be non-empty")

# nrow(i)==0 by-join, #4364 (broke in dev 1.12.9)
d0 = data.table(id=integer(), n=integer())
Expand Down Expand Up @@ -17996,8 +17998,11 @@ test(2230.3, setDF(merge(DT, y, by="k2", incomparables=c(4,5))), merge(x, y,
test(2230.4, setDF(merge(DT, y, by="k2", incomparables=c(1, NA, 4, 5))), merge(x, y, by="k2", incomparables=c(1,NA,4,5)))
test(2230.5, setDF(merge(DT, y, by="k2", incomparables=c(NA, 3, 4, 5))), merge(x, y, by="k2", incomparables=c(NA,3,4,5)))
test(2230.6, merge(DT, y, by="k2", unk=1), merge(DT, y, by="k2"), warning="Unknown argument 'unk' has been passed.")
test(2230.7, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L),
merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.", "Passed 1 unknown and unnamed arguments."))
test(2230.7,
merge(DT, y, by.x = "k2", by.y = "k2"),
merge(DT, y, by = "k2"))



# weighted.mean GForce optimized, #3977
old = options(datatable.optimize=1L)
Expand Down

0 comments on commit b1c299c

Please sign in to comment.