diff --git a/.gitignore b/.gitignore index e05f2b803..9c6c65c5a 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ vignettes/plots/figures *.o *.so *.dll +*.dSYM # temp files *~ diff --git a/NEWS.md b/NEWS.md index 48f7c529e..79d223e3f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ ## NEW FEATURES +0. Using `which = NA` in a join could return incorrect row indices, [#4303](https://github.com/Rdatatable/data.table/issues/4303). Thanks to @cbilot for reporting. + 1. `nafill()` now applies `fill=` to the front/back of the vector when `type="locf|nocb"`, [#3594](https://github.com/Rdatatable/data.table/issues/3594). Thanks to @ben519 for the feature request. It also now returns a named object based on the input names. Note that if you are considering joining and then using `nafill(...,type='locf|nocb')` afterwards, please review `roll=`/`rollends=` which should achieve the same result in one step more efficiently. `nafill()` is for when filling-while-joining (i.e. `roll=`/`rollends=`/`nomatch=`) cannot be applied. 2. `mean(na.rm=TRUE)` by group is now GForce optimized, [#4849](https://github.com/Rdatatable/data.table/issues/4849). Thanks to the [h2oai/db-benchmark](https://github.com/h2oai/db-benchmark) project for spotting this issue. The 1 billion row example in the issue shows 48s reduced to 14s. The optimization also applies to type `integer64` resulting in a difference to the `bit64::mean.integer64` method: `data.table` returns a `double` result whereas `bit64` rounds the mean to the nearest integer. 
diff --git a/R/data.table.R b/R/data.table.R index 801482147..f4b681ed4 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -518,7 +518,7 @@ replace_dot_alias = function(e) { # If using secondary key of x, f__ will refer to xo if (is.na(which)) { w = if (notjoin) f__!=0L else is.na(f__) - return( if (length(xo)) fsort(xo[w], internal=TRUE) else which(w) ) + return( if (length(xo) && notjoin) fsort(xo[w], internal=TRUE) else which(w) ) } if (mult=="all") { # is by=.EACHI along with non-equi join? diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8eeb8f7ee..ad71a0b2b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18111,3 +18111,16 @@ test(2238.9, NA %notin% c(1:5, NA), FALSE) # shift actionable error on matrix input #5287 test(2239.1, shift(matrix(1:10, ncol = 1)), error="consider wrapping") + +# join with which = NA must return the row numbers of i that have no match in x, #4303 +customers = data.table(ID = c( + 108924851L, 105257553L, 118054200L, 108365953L, + 116642294L, 100419961L, 115677488L, 100405475L, + 119246064L, 100383251L +)) +orders = data.table(ID = c(105257553L)) +test(2240.1, customers[orders, on = .(ID), which = NA], integer()) +orders = data.table(ID = c(105257554L)) +test(2240.2, customers[orders, on = .(ID), which = NA], 1L) +orders = data.table(ID = c(105257554L, 108924851L)) +test(2240.3, customers[orders, on = .(ID), which = NA], 1L)