#1761 and #1705, regression fix + ‘x.’ prefix in by=.EACHI joins

Rdatatable · Aug 27, 2016 · 437c440 · 437c440
1 parent 8441ff7
commit 437c440
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 9 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -61,7 +61,7 @@
 
   29. New `split` method for data.table. Faster, more flexible and consistent with data.frame method. Closes [#1389](https://github.com/Rdatatable/data.table/issues/1389).
 
-  30. x's columns can be referred to in `j` using the prefix `x.` at all times. This is particularly useful when it is necessary to x's column that is *also a join column*. This is a patch addressing [#1615](https://github.com/Rdatatable/data.table/issues/1615).
+  30. x's columns can be referred to in `j` using the prefix `x.` at all times. This is particularly useful when it is necessary to x's column that is *also a join column*. This is a patch addressing [#1615](https://github.com/Rdatatable/data.table/issues/1615). Also closes [#1705](https://github.com/Rdatatable/data.table/issues/1705) (thanks @dbetebenner) and [#1761](https://github.com/Rdatatable/data.table/issues/1761).
 
   31. New function `fwrite` implements [#580](https://github.com/Rdatatable/data.table/issues/580). Thanks to Otto Seiskari for C code, R wrapper, manual page and extensive tests.
 

diff --git a/R/data.table.R b/R/data.table.R
@@ -793,6 +793,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
     byval = NULL
     xnrow = nrow(x)
     xcols = xcolsAns = icols = icolsAns = integer()
+    xdotcols = FALSE
     othervars = character(0)
     if (missing(j)) {
         # missing(by)==TRUE was already checked above before dealing with i
@@ -817,6 +818,11 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
         }
         ansvals = chmatch(ansvars, nx)
     } else {
+        if (is.data.table(i)) {
+            idotprefix = paste0("i.", names(i))
+            xdotprefix = paste0("x.", names(x))
+        } else idotprefix = xdotprefix = character(0)
+
         # j was substituted before dealing with i so that := can set allow.cartesian=FALSE (#800) (used above in i logic)
         if (is.null(jsub)) return(NULL)
         if (is.call(jsub) && jsub[[1L]]==":=") {
@@ -1093,7 +1099,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                     }
                 }
                 # fix for long standing FR/bug, #495 and #484
-                allcols = c(names(x), paste("x.",names(x),sep=""), if (is.data.table(i)) c(names(i), paste("i.", names(i), sep="")))
+                allcols = c(names(x), xdotprefix, names(i), idotprefix)
                 if ( length(othervars <- setdiff(intersect(av, allcols), c(bynames, ansvars))) ) {
                     # we've a situation like DT[, c(sum(V1), lapply(.SD, mean)), by=., .SDcols=...] or 
                     # DT[, lapply(.SD, function(x) x *v1), by=, .SDcols=...] etc., 
@@ -1103,7 +1109,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                 # .SDcols might include grouping columns if users wants that, but normally we expect user not to include them in .SDcols
             } else {
                 if (!missing(.SDcols)) warning("This j doesn't use .SD but .SDcols has been supplied. Ignoring .SDcols. See ?data.table.")
-                allcols = c(names(x), paste("x.",names(x),sep=""), if (is.data.table(i)) c(names(i), paste("i.", names(i), sep="")))
+                allcols = c(names(x), xdotprefix, names(i), idotprefix)
                 ansvars = setdiff(intersect(av,allcols), bynames)
                 if (verbose) cat("Detected that j uses these columns:",if (!length(ansvars)) "<none>" else paste(ansvars,collapse=","),"\n")
                 # using a few named columns will be faster
@@ -1123,7 +1129,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                     # eval(macro) column names are detected via the  if jsub[[1]]==eval switch earlier above.
                 }
                 if (length(ansvars)) othervars = ansvars # #1744 fix
-                allcols = c(names(x), paste("x.",names(x),sep=""), if (is.data.table(i)) c(names(i), paste("i.", names(i), sep="")))
+                allcols = c(names(x), xdotprefix, names(i), idotprefix)
                 ansvars = setdiff(allcols,bynames) # fix for bug #5443
                 ansvals = chmatch(ansvars, names(x))
                 if (length(othervars)) othervars = setdiff(ansvars, othervars) # #1744 fix
@@ -1245,9 +1251,10 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
             # patch for #1615. Allow 'x.' syntax. Only useful during join op when x's join col needs to be used.
             # Note that I specifically have not implemented x[y, aa, on=c(aa="bb")] to refer to x's join column 
             # as well because x[i, col] == x[i][, col] will not be TRUE anymore..
-            xjoincols = paste("x.",names(x),sep="")
-            if ( any(xjoinvals <- ansvars %in% xjoincols))
-                w[xjoinvals] = chmatch(ansvars[xjoinvals], xjoincols)
+            if ( any(xdotprefixvals <- ansvars %in% xdotprefix)) {
+                w[xdotprefixvals] = chmatch(ansvars[xdotprefixvals], xdotprefix)
+                xdotcols = TRUE
+            }
             if (!any(wna <- is.na(w))) {
                 xcols = w
                 xcolsAns = seq_along(ansvars)
@@ -1514,6 +1521,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
     if (length(xcols)) {
         #  TODO add: if (length(alloc)==nrow(x)) stop("There is no need to deep copy x in this case")
         SDenv$.SDall = .Call(CsubsetDT,x,alloc,xcols)    # must be deep copy when largest group is a subset
+        if (xdotcols) setattr(SDenv$.SDall, 'names', ansvars[seq_along(xcols)]) # now that we allow 'x.' prefix in 'j'
         SDenv$.SD = if (!length(othervars)) SDenv$.SDall else shallow(SDenv$.SDall, setdiff(ansvars, othervars))
     }
     if (nrow(SDenv$.SDall)==0L) {

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -4556,10 +4556,10 @@ test(1288.16, rbindlist(ll, fill=TRUE), error="fill=TRUE, but names of input lis
 # TO DO: TODO: think of and add more tests for rbindlist
 
 # fix for #5647
-dt <- data.table(x=1L, y=1:10)
+dt = data.table(x=1L, y=1:10)
 cp = copy(dt)
 test(1289.1, dt[,z := c(rep(NA, 5), y), by=x], cp[, z := c(rep(NA, 5), y[1:5])], warning="RHS 1 is length 15")
-dt<-data.table(x=c(1:2), y=1:10)
+dt = data.table(x=c(1:2), y=1:10)
 cp = copy(dt)
 test(1289.2, dt[, z := c(rep(NA, 5),y), by=x], cp[, z := rep(NA_integer_, 10)], warning="RHS 1 is length 10")
 
@@ -9188,6 +9188,22 @@ test(1700.1, dcast(dt, type ~ subtype, value.var = c("var1", "var2"), fun = func
     data.table(type=c("A","B"), var1_1=c("a", "d|e"), var1_2=c("b|c", "f"), 
                 var2_1=c("aa", "ee"), var2_2=c("bb|cc|dd","ff"), key="type"))
 
+# fixing regression introduced on providing functionality of 'x.' prefix in 'j' (for non-equi joins)
+A = data.table(x=c(1,1,1,2,2), y=1:5, z=5:1)
+B = data.table(x=c(2,3), val=4)
+col1 = "y"
+col2 = "x.y"
+test(1701.1, A[, .(length(x), length(y)), by=x], data.table(x=c(1,2), V1=1L, V2=c(3:2)))
+test(1701.2, A[, .(x), by=x], data.table(x=c(1,2), x=c(1,2)))
+test(1701.3, A[B, x.x, on="x"], c(2,2,NA))
+test(1701.4, A[B, x.y, on="x"], c(4:5,NA))
+test(1701.5, A[B, .(get("x"), get("x.x")), on="x"], data.table(V1=c(2,2,3), V2=c(2,2,NA)))
+test(1701.6, A[B, mget(c("x", "x.x")), on="x"], data.table(x=c(2,2,3), x.x=c(2,2,NA)))
+# 1761 fix as well 
+test(1701.6, A[B, .(x.x, get("x.x"), x.y), on="x", by=.EACHI], data.table(x=c(2,2,3), x.x=c(2,2,NA), V2=c(2,2,NA), x.y=c(4:5,NA)))
+dt = data.table(a=1L)
+test(1701.7, dt[dt, .(xa=x.a, ia=i.a), .EACHI, on="a"], data.table(a=1L, xa=1L, ia=1L))
+
 ##########################
 
 # TODO: Tests involving GForce functions needs to be run with optimisation level 1 and 2, so that both functions are tested all the time.