From 45ce60ea522c5054e158b4212df76d1da5db3736 Mon Sep 17 00:00:00 2001
From: Michael Chirico <michael.chirico@grabtaxi.com>
Date: Sun, 11 Aug 2019 13:01:11 +0800
Subject: [PATCH] Closes #3758 and completes coverage of data.table.R

---
 NEWS.md               |  2 ++
 R/bmerge.R            |  2 +-
 R/data.table.R        | 40 +++++++++++++++++------------------
 inst/tests/tests.Rraw | 49 ++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 71 insertions(+), 22 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index f6a44b54d..7525a954e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -222,6 +222,8 @@
 
 24. `column not found` could incorrectly occur in rare non-equi-join cases, [#3635](https://github.com/Rdatatable/data.table/issues/3635). Thanks to @UweBlock for the report.
 
+25. Slight fix to the logic for auto-naming the `by` clause for using a custom function like `evaluate` to now be named `evaluate` instead of the name of the first symbolic argument, [#3758](https://github.com/Rdatatable/data.table/issues/3758).
+
 #### NOTES
 
 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.
diff --git a/R/bmerge.R b/R/bmerge.R
index 321a27074..d88a81365 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -57,7 +57,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
       }
     }
     if (xclass == iclass) {
-      if (verbose) cat("i.",names(i)[ic],"has same type (",xclass,") as x.",names(x)[xc],". No coercion needed.\n", sep="")
+      if (verbose) cat("i.", names(i)[ic], " has same type (", xclass, ") as x.", names(x)[xc], ". No coercion needed.\n", sep="")
       next
     }
     if (xclass=="character" || iclass=="character" ||
diff --git a/R/data.table.R b/R/data.table.R
index a7e085a5f..0d61bb8bc 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -841,8 +841,10 @@ replace_order = function(isub, verbose, env) {
               byvars = all.vars(bysubl[[jj+1L]], functions = TRUE)
               if (length(byvars) == 1L) tt = byvars
               else {
-                tt = grep("^eval|^[^[:alpha:]. ]",byvars,invert=TRUE,value=TRUE)
-                if (length(tt)) tt = tt[1L] else all.vars(bysubl[[jj+1L]])[1L]
+                # take the first variable that is (1) not eval (#3758) and (2) starts with a character that can't start a variable name
+                tt = grep("^eval$|^[^[:alpha:]. ]", byvars, invert=TRUE, value=TRUE)
+                # byvars but exclude functions or `0`+`1` becomes `+`
+                tt = if (length(tt)) tt[1L] else all.vars(bysubl[[jj+1L]])[1L]
               }
               # fix for #497
               if (length(byvars) > 1L && tt %chin% all.vars(jsub, FALSE)) {
@@ -1087,9 +1089,9 @@ replace_order = function(isub, verbose, env) {
               if (is.list(k)) {
                 origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame())
                 if (is.character(j)) {
-                  if (length(j)!=1L) stop("L[[i]][,:=] syntax only valid when i is length 1, but it's length %d",length(j))
+                  if (length(j)!=1L) stop("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but it's length ", length(j))
                   j = match(j, names(k))
-                  if (is.na(j)) stop("Item '",origj,"' not found in names of list")
+                  if (is.na(j)) stop("Internal error -- item '", origj, "' not found in names of list") # nocov
                 }
                 .Call(Csetlistelt,k,as.integer(j), x)
               } else if (is.environment(k) && exists(as.character(name[[3L]]), k)) {
@@ -1118,7 +1120,7 @@ replace_order = function(isub, verbose, env) {
         xcolsAns = seq_along(ansvars)
         icols = icolsAns = integer()
       } else {
-        if (!length(leftcols)) stop("column(s) not found: ", paste(ansvars[wna],collapse=", "))
+        if (!length(leftcols)) stop("Internal error -- column(s) not found: ", paste(ansvars[wna],collapse=", ")) # nocov
         xcols = w[!wna]
         xcolsAns = which(!wna)
         map = c(seq_along(i), leftcols)   # this map is to handle dups in leftcols, #3635
@@ -1131,7 +1133,7 @@ replace_order = function(isub, verbose, env) {
           if (any(w2na <- is.na(w2))) {
             ivars[leftcols] = paste0("i.",ivars[leftcols])
             w2[w2na] = chmatch(ansvars[wna][w2na], ivars)
-            if (any(w2na <- is.na(w2))) stop("column(s) not found: ", paste(ansvars[wna][w2na],sep=", "))
+            if (any(w2na <- is.na(w2))) stop("Internal error -- column(s) not found: ", paste(ansvars[wna][w2na],sep=", ")) # nocov
           }
         }
         icols = w2
@@ -1294,7 +1296,7 @@ replace_order = function(isub, verbose, env) {
           identical(irows, integer(0L)) && !bynull,
           length(irows) && !anyNA(irows) && all(irows==0L) ## anyNA() because all() returns NA (not FALSE) when irows is all-NA. TODO: any way to not check all 'irows' values?
           ))
-        if (is.atomic(jval)) jval = jval[0L] else jval = lapply(jval, `[`, 0L)
+        jval = lapply(jval, `[`, 0L)
       if (is.atomic(jval)) {
         setattr(jval,"names",NULL)
         jval = data.table(jval) # TO DO: should this be setDT(list(jval)) instead?
@@ -1884,7 +1886,7 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) {
   non.numeric = non.atomic = FALSE
   all.logical = TRUE
   for (j in seq_len(p)) {
-    if (is.ff(X[[j]])) X[[j]] = X[[j]][]   # to bring the ff into memory, since we need to create a matrix in memory
+    if (is.ff(X[[j]])) X[[j]] = X[[j]][]   # nocov to bring the ff into memory, since we need to create a matrix in memory
     xj = X[[j]]
     if (length(dj <- dim(xj)) == 2L && dj[2L] > 1L) {
       if (inherits(xj, "data.table"))
@@ -1930,13 +1932,13 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) {
 
 # bug #2375. fixed. same as head.data.frame and tail.data.frame to deal with negative indices
 head.data.table = function(x, n=6L, ...) {
-  if (!cedta()) return(NextMethod())
+  if (!cedta()) return(NextMethod()) # nocov
   stopifnot(length(n) == 1L)
   i = seq_len(if (n<0L) max(nrow(x)+n, 0L) else min(n,nrow(x)))
   x[i, , ]
 }
 tail.data.table = function(x, n=6L, ...) {
-  if (!cedta()) return(NextMethod())
+  if (!cedta()) return(NextMethod()) # nocov
   stopifnot(length(n) == 1L)
   n = if (n<0L) max(nrow(x) + n, 0L) else min(n, nrow(x))
   i = seq.int(to=nrow(x), length.out=n)
@@ -2077,7 +2079,7 @@ within.data.table = function (data, expr, ...)
 # basically within.list but retains key (if any)
 # will be slower than using := or a regular query (see ?within for further info).
 {
-  if (!cedta()) return(NextMethod())
+  if (!cedta()) return(NextMethod()) # nocov
   parent = parent.frame()
   e = evalq(environment(), data, parent)
   eval(substitute(expr), e)  # might (and it's known that some user code does) contain rm()
@@ -2101,7 +2103,7 @@ within.data.table = function (data, expr, ...)
 transform.data.table = function (`_data`, ...)
 # basically transform.data.frame with data.table instead of data.frame, and retains key
 {
-  if (!cedta()) return(NextMethod())
+  if (!cedta()) return(NextMethod()) # nocov
   e = eval(substitute(list(...)), `_data`, parent.frame())
   tags = names(e)
   inx = chmatch(tags, names(`_data`))
@@ -2176,7 +2178,7 @@ any_na = function(x, by=seq_along(x)) .Call(CanyNA, x, by)
 
 na.omit.data.table = function (object, cols = seq_along(object), invert = FALSE, ...) {
   # compare to stats:::na.omit.data.frame
-  if (!cedta()) return(NextMethod())
+  if (!cedta()) return(NextMethod()) # nocov
   if ( !missing(invert) && is.na(as.logical(invert)) )
     stop("Argument 'invert' must be logical TRUE/FALSE")
   if (is.character(cols)) {
@@ -2761,7 +2763,7 @@ rowid = function(..., prefix=NULL) {
 
 rowidv = function(x, cols=seq_along(x), prefix=NULL) {
   if (!is.null(prefix) && (!is.character(prefix) || length(prefix) != 1L))
-    stop("prefix must be NULL or a character vector of length=1.")
+    stop("'prefix' must be NULL or a character vector of length 1.")
   if (is.atomic(x)) {
     if (!missing(cols) && !is.null(cols))
       stop("x is a single vector, non-NULL 'cols' doesn't make sense.")
@@ -2769,7 +2771,7 @@ rowidv = function(x, cols=seq_along(x), prefix=NULL) {
     x = as_list(x)
   } else {
     if (!length(cols))
-      stop("x is a list, 'cols' can not be on 0-length.")
+      stop("x is a list, 'cols' cannot be 0-length.")
     if (is.character(cols))
       cols = chmatch(cols, names(x))
     cols = as.integer(cols)
@@ -2790,7 +2792,7 @@ rleid = function(..., prefix=NULL) {
 
 rleidv = function(x, cols=seq_along(x), prefix=NULL) {
   if (!is.null(prefix) && (!is.character(prefix) || length(prefix) != 1L))
-    stop("prefix must be NULL or a character vector of length=1.")
+    stop("'prefix' must be NULL or a character vector of length 1.")
   if (is.atomic(x)) {
     if (!missing(cols) && !is.null(cols))
       stop("x is a single vector, non-NULL 'cols' doesn't make sense.")
@@ -2798,7 +2800,7 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) {
     x = as_list(x)
   } else {
     if (!length(cols))
-      stop("x is a list, 'cols' can not be 0-length.")
+      stop("x is a list, 'cols' cannot be 0-length.")
     if (is.character(cols))
       cols = chmatch(cols, names(x))
     cols = as.integer(cols)
@@ -2880,7 +2882,7 @@ isReallyReal = function(x) {
       ## redirect to normal DT[x == TRUE]
       stub = call("==", as.symbol(col), TRUE)
     }
-    if (length(stub[[1L]]) != 1) return(NULL) ## Whatever it is, definitely not one of the valid operators
+    if (length(stub[[1L]]) != 1) return(NULL) # nocov Whatever it is, definitely not one of the valid operators
     operator = as.character(stub[[1L]])
     if (!operator %chin% validOps$op) return(NULL) ## operator not supported
     if (!is.name(stub[[2L]])) return(NULL)
@@ -2902,7 +2904,6 @@ isReallyReal = function(x) {
       # the mode() checks also deals with NULL since mode(NULL)=="NULL" and causes this return, as one CRAN package (eplusr 0.9.1) relies on
       return(NULL)
     }
-    if(is.character(x[[col]]) && !operator %chin% c("==", "%in%", "%chin%")) return(NULL) ## base R allows for non-equi operators on character columns, but these can't be optimized.
     if (!operator %chin% c("%in%", "%chin%")) {
       # additional requirements for notjoin and NA values. Behaviour is different for %in%, %chin% compared to other operators
       # RHS is of length=1 or n
@@ -2998,7 +2999,6 @@ isReallyReal = function(x) {
   pat = paste0("(", ops, ")", collapse="|")
   if (is.call(onsub) && onsub[[1L]] == "eval") {
     onsub = eval(onsub[[2L]], parent.frame(2L), parent.frame(2L))
-    if (is.call(onsub) && onsub[[1L]] == "eval") { onsub = onsub[[2L]] }
   }
   if (is.call(onsub) && as.character(onsub[[1L]]) %chin% c("list", ".")) {
     spat = paste0("[ ]+(", pat, ")[ ]+")
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 2a7d5d65e..4077654e3 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -1653,6 +1653,7 @@ test(567, DT[,.N,list(a,b)][,N,by=a]$N, c(1L,1L,2L,1L))
 test(568, DT[,.N,list(a,b)][,unique(N),by=a]$V1, c(1L,2L,1L))
 test(569, DT[,list(.N=.N),list(a,b)][,.N,a], error="The column '.N' can't be grouped because")
 test(570, DT[,list(.N=.N),list(a,b)][,unique(.N),a], error="The column '.N' can't be grouped because")
+test(570.1, DT[,list(.I=.I),list(a,b)][,.I,a], error="The column '.I' can't be grouped because")
 
 # Test spaces in by="..." format, datatable-help on 31 March
 DT = data.table("a "=1:2, "b"=3:4," b"=5:6, v=1:6)
@@ -15001,7 +15002,7 @@ test(2050.6, rbind(DT[1], data.table(f=factor(letters[10:11]))[0])[,levels(f)],
 test(2051.1, `-.IDate`(structure(0, class="Date"), 1L), structure(-1, class="Date"))
 test(2051.2, `-.IDate`(1L, 1L), error = 'only subtract from "IDate"')
 test(2051.3, format.data.table(1L), error = 'Possibly corrupt data.table')
-test(2051.4, rleidv(prefix = 1L), error = 'prefix must be NULL or')
+test(2051.4, rleidv(prefix = 1L), error = "'prefix' must be NULL or")
 ## passing Date to second argument of as.POSIXct.ITime
 t = as.ITime(0L)
 test(2051.5, as.POSIXct(t, structure(0L, class="Date")), .POSIXct(0, 'UTC'))
@@ -15522,6 +15523,52 @@ test(2073.7, transpose(L, make.names=NA), error="make.names=NA is out of range [
 test(2073.8, transpose(L, make.names=2), list(A=INT(1,4), B=INT(2,5), C=INT(3,6)))
 test(2073.9, transpose(L, make.names=2, keep.names='foo'), list(foo=c("a","b"), A=INT(1,4), B=INT(2,5), C=INT(3,6)))
 
+# coverage of data.table.R
+## i is NULL
+test(2074.01, data.table(1:10)[NULL], data.table(NULL))
+## auto-guessing of byvars when none of the columns have "normal" names
+test(2074.02, data.table(`0`=0, `1`=1)[ , TRUE, by = .(`0` + `1`)], data.table(`0`=1, V1=TRUE))
+## also eval.+ columns are OK, just not eval( patterns, #3758
+evaluate = function(x) c('F', 'D', 'C', 'B', 'A')[findInterval(x, c(0, 60, 70, 80, 90, 100))]
+test(2074.03, data.table(grade=c(50L, 91L, 95L, 51L, 89L))[ , .N, by=evaluate(grade)],
+     data.table(evaluate=c('F', 'A', 'B'), N=c(2L, 2L, 1L)))
+## error: use recursive character list indexing to assign when also doing alloc.col()
+opt = options(datatable.alloccol=1L)
+l = list(foo = list(bar = data.table(a = 1:3, b = 4:6)))
+test(2074.04, l[[c('foo', 'bar')]][ , (letters) := 16:18], error = 'under-allocated recursively indexed list')
+options(opt)
+## alloc.col when using 0-truelength j assigning to a subset
+DT = data.table(a=1)
+### construct incorrectly to have 0 truelength
+zDT = structure(list(b=2), class = c('data.table', 'data.frame'))
+test(2074.05, DT[1L, b := zDT], data.table(a=1, b=2))
+## nested .SD in j
+DT = data.table(a=1, b=2)
+test(2074.06, DT[ , c(.SD[1], .SD[1, .SD[1]]), by=a], data.table(a=1, b=2, b=2))
+## as.matrix.data.table when a column has columns (only possible when constructed incorrectly)
+DT = structure(list(a=1:5, d=data.table(b=6:10, c=11:15), m=matrix(16:25, ncol=2L)), class = c('data.table', 'data.frame'))
+test(2074.07, as.matrix(DT), matrix(1:25, ncol=5L, dimnames=list(NULL, c('a', 'd.b', 'd.c', 'm.1', 'm.2'))))
+## can induce !cedta() from base::rownames to get this error
+test(2074.08, rownames(structure(list(1:5), class='data.table')), error="Has it been created manually")
+## default dimnames.data.table
+test(2074.09, dimnames(data.table(a = 1)), list(NULL, 'a'))
+## unlock argument of .shallow
+DT = data.table(a = 1)
+setattr(DT, '.data.table.locked', TRUE)
+test(2074.10, attr(.shallow(DT, unlock=TRUE), '.data.table.locked'), NULL)
+## coverage of rowidv & rleidv
+test(2074.11, rowidv(1:10, cols=1), error="x is a single vector, non-NULL 'cols'")
+test(2074.12, rowidv(1:10), rep(1L, 10L))
+test(2074.13, rowidv(list(1:10), cols=integer()), error="x is a list, 'cols' cannot be 0-length")
+test(2074.14, rleidv(1:10, cols=1), error="x is a single vector, non-NULL 'cols'")
+test(2074.15, rleidv(list(1:10), cols=integer()), error="x is a list, 'cols' cannot be 0-length")
+## coverage of .prepareFastSubset
+DT = data.table(V1=c('a', 'b', 'a'), V2 = c('hello', 'ello', 'llo'), x=TRUE)
+test(2074.16, nrow(DT[!(V1=='a' & V2 %like% 'll')]), 1L)
+y = c(TRUE, FALSE, FALSE)
+test(2074.17, nrow(DT[x & y]), 1L)
+setkey(DT, V1)
+test(2074.18, DT[V1=='a', verbose=TRUE], output='Optimized subsetting with key')
 
 ###################################
 #  Add new tests above this line  #