From 45ce60ea522c5054e158b4212df76d1da5db3736 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 11 Aug 2019 13:01:11 +0800 Subject: [PATCH] Closes #3758 and completes coverage of data.table.R --- NEWS.md | 2 ++ R/bmerge.R | 2 +- R/data.table.R | 40 +++++++++++++++++------------------ inst/tests/tests.Rraw | 49 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 71 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index f6a44b54d..7525a954e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -222,6 +222,8 @@ 24. `column not found` could incorrectly occur in rare non-equi-join cases, [#3635](https://github.com/Rdatatable/data.table/issues/3635). Thanks to @UweBlock for the report. +25. Slight fix to the logic for auto-naming the `by` clause for using a custom function like `evaluate` to now be named `evaluate` instead of the name of the first symbolic argument, [#3758](https://github.com/Rdatatable/data.table/issues/3758). + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/bmerge.R b/R/bmerge.R index 321a27074..d88a81365 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -57,7 +57,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } } if (xclass == iclass) { - if (verbose) cat("i.",names(i)[ic],"has same type (",xclass,") as x.",names(x)[xc],". No coercion needed.\n", sep="") + if (verbose) cat("i.", names(i)[ic], " has same type (", xclass, ") as x.", names(x)[xc], ". No coercion needed.\n", sep="") next } if (xclass=="character" || iclass=="character" || diff --git a/R/data.table.R b/R/data.table.R index a7e085a5f..0d61bb8bc 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -841,8 +841,10 @@ replace_order = function(isub, verbose, env) { byvars = all.vars(bysubl[[jj+1L]], functions = TRUE) if (length(byvars) == 1L) tt = byvars else { - tt = grep("^eval|^[^[:alpha:]. ]",byvars,invert=TRUE,value=TRUE) - if (length(tt)) tt = tt[1L] else all.vars(bysubl[[jj+1L]])[1L] + # take the first variable that is (1) not eval (#3758) and (2) starts with a character that can't start a variable name + tt = grep("^eval$|^[^[:alpha:]. ]", byvars, invert=TRUE, value=TRUE) + # byvars but exclude functions or `0`+`1` becomes `+` + tt = if (length(tt)) tt[1L] else all.vars(bysubl[[jj+1L]])[1L] } # fix for #497 if (length(byvars) > 1L && tt %chin% all.vars(jsub, FALSE)) { @@ -1087,9 +1089,9 @@ replace_order = function(isub, verbose, env) { if (is.list(k)) { origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame()) if (is.character(j)) { - if (length(j)!=1L) stop("L[[i]][,:=] syntax only valid when i is length 1, but it's length %d",length(j)) + if (length(j)!=1L) stop("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but it's length ", length(j)) j = match(j, names(k)) - if (is.na(j)) stop("Item '",origj,"' not found in names of list") + if (is.na(j)) stop("Internal error -- item '", origj, "' not found in names of list") # nocov } .Call(Csetlistelt,k,as.integer(j), x) } else if (is.environment(k) && exists(as.character(name[[3L]]), k)) { @@ -1118,7 +1120,7 @@ replace_order = function(isub, verbose, env) { xcolsAns = seq_along(ansvars) icols = icolsAns = integer() } else { - if (!length(leftcols)) stop("column(s) not found: ", paste(ansvars[wna],collapse=", ")) + if (!length(leftcols)) stop("Internal error -- column(s) not found: ", paste(ansvars[wna],collapse=", ")) # nocov xcols = w[!wna] xcolsAns = which(!wna) map = c(seq_along(i), leftcols) # this map is to handle dups in leftcols, #3635 @@ -1131,7 +1133,7 @@ replace_order = function(isub, verbose, env) { if (any(w2na <- is.na(w2))) { ivars[leftcols] = paste0("i.",ivars[leftcols]) w2[w2na] = chmatch(ansvars[wna][w2na], ivars) - if (any(w2na <- is.na(w2))) stop("column(s) not found: ", paste(ansvars[wna][w2na],sep=", ")) + if (any(w2na <- is.na(w2))) stop("Internal error -- column(s) not found: ", paste(ansvars[wna][w2na],sep=", ")) # nocov } } icols = w2 @@ -1294,7 +1296,7 @@ replace_order = function(isub, verbose, env) { identical(irows, integer(0L)) && !bynull, length(irows) && !anyNA(irows) && all(irows==0L) ## anyNA() because all() returns NA (not FALSE) when irows is all-NA. TODO: any way to not check all 'irows' values? )) - if (is.atomic(jval)) jval = jval[0L] else jval = lapply(jval, `[`, 0L) + jval = lapply(jval, `[`, 0L) if (is.atomic(jval)) { setattr(jval,"names",NULL) jval = data.table(jval) # TO DO: should this be setDT(list(jval)) instead? @@ -1884,7 +1886,7 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { non.numeric = non.atomic = FALSE all.logical = TRUE for (j in seq_len(p)) { - if (is.ff(X[[j]])) X[[j]] = X[[j]][] # to bring the ff into memory, since we need to create a matrix in memory + if (is.ff(X[[j]])) X[[j]] = X[[j]][] # nocov to bring the ff into memory, since we need to create a matrix in memory xj = X[[j]] if (length(dj <- dim(xj)) == 2L && dj[2L] > 1L) { if (inherits(xj, "data.table")) @@ -1930,13 +1932,13 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { # bug #2375. fixed. same as head.data.frame and tail.data.frame to deal with negative indices head.data.table = function(x, n=6L, ...) { - if (!cedta()) return(NextMethod()) + if (!cedta()) return(NextMethod()) # nocov stopifnot(length(n) == 1L) i = seq_len(if (n<0L) max(nrow(x)+n, 0L) else min(n,nrow(x))) x[i, , ] } tail.data.table = function(x, n=6L, ...) { - if (!cedta()) return(NextMethod()) + if (!cedta()) return(NextMethod()) # nocov stopifnot(length(n) == 1L) n = if (n<0L) max(nrow(x) + n, 0L) else min(n, nrow(x)) i = seq.int(to=nrow(x), length.out=n) @@ -2077,7 +2079,7 @@ within.data.table = function (data, expr, ...) # basically within.list but retains key (if any) # will be slower than using := or a regular query (see ?within for further info). { - if (!cedta()) return(NextMethod()) + if (!cedta()) return(NextMethod()) # nocov parent = parent.frame() e = evalq(environment(), data, parent) eval(substitute(expr), e) # might (and it's known that some user code does) contain rm() @@ -2101,7 +2103,7 @@ within.data.table = function (data, expr, ...) transform.data.table = function (`_data`, ...) # basically transform.data.frame with data.table instead of data.frame, and retains key { - if (!cedta()) return(NextMethod()) + if (!cedta()) return(NextMethod()) # nocov e = eval(substitute(list(...)), `_data`, parent.frame()) tags = names(e) inx = chmatch(tags, names(`_data`)) @@ -2176,7 +2178,7 @@ any_na = function(x, by=seq_along(x)) .Call(CanyNA, x, by) na.omit.data.table = function (object, cols = seq_along(object), invert = FALSE, ...) { # compare to stats:::na.omit.data.frame - if (!cedta()) return(NextMethod()) + if (!cedta()) return(NextMethod()) # nocov if ( !missing(invert) && is.na(as.logical(invert)) ) stop("Argument 'invert' must be logical TRUE/FALSE") if (is.character(cols)) { @@ -2761,7 +2763,7 @@ rowid = function(..., prefix=NULL) { rowidv = function(x, cols=seq_along(x), prefix=NULL) { if (!is.null(prefix) && (!is.character(prefix) || length(prefix) != 1L)) - stop("prefix must be NULL or a character vector of length=1.") + stop("'prefix' must be NULL or a character vector of length 1.") if (is.atomic(x)) { if (!missing(cols) && !is.null(cols)) stop("x is a single vector, non-NULL 'cols' doesn't make sense.") @@ -2769,7 +2771,7 @@ rowidv = function(x, cols=seq_along(x), prefix=NULL) { x = as_list(x) } else { if (!length(cols)) - stop("x is a list, 'cols' can not be on 0-length.") + stop("x is a list, 'cols' cannot be 0-length.") if (is.character(cols)) cols = chmatch(cols, names(x)) cols = as.integer(cols) @@ -2790,7 +2792,7 @@ rleid = function(..., prefix=NULL) { rleidv = function(x, cols=seq_along(x), prefix=NULL) { if (!is.null(prefix) && (!is.character(prefix) || length(prefix) != 1L)) - stop("prefix must be NULL or a character vector of length=1.") + stop("'prefix' must be NULL or a character vector of length 1.") if (is.atomic(x)) { if (!missing(cols) && !is.null(cols)) stop("x is a single vector, non-NULL 'cols' doesn't make sense.") @@ -2798,7 +2800,7 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { x = as_list(x) } else { if (!length(cols)) - stop("x is a list, 'cols' can not be 0-length.") + stop("x is a list, 'cols' cannot be 0-length.") if (is.character(cols)) cols = chmatch(cols, names(x)) cols = as.integer(cols) @@ -2880,7 +2882,7 @@ isReallyReal = function(x) { ## redirect to normal DT[x == TRUE] stub = call("==", as.symbol(col), TRUE) } - if (length(stub[[1L]]) != 1) return(NULL) ## Whatever it is, definitely not one of the valid operators + if (length(stub[[1L]]) != 1) return(NULL) # nocov Whatever it is, definitely not one of the valid operators operator = as.character(stub[[1L]]) if (!operator %chin% validOps$op) return(NULL) ## operator not supported if (!is.name(stub[[2L]])) return(NULL) @@ -2902,7 +2904,6 @@ isReallyReal = function(x) { # the mode() checks also deals with NULL since mode(NULL)=="NULL" and causes this return, as one CRAN package (eplusr 0.9.1) relies on return(NULL) } - if(is.character(x[[col]]) && !operator %chin% c("==", "%in%", "%chin%")) return(NULL) ## base R allows for non-equi operators on character columns, but these can't be optimized. if (!operator %chin% c("%in%", "%chin%")) { # additional requirements for notjoin and NA values. Behaviour is different for %in%, %chin% compared to other operators # RHS is of length=1 or n @@ -2998,7 +2999,6 @@ isReallyReal = function(x) { pat = paste0("(", ops, ")", collapse="|") if (is.call(onsub) && onsub[[1L]] == "eval") { onsub = eval(onsub[[2L]], parent.frame(2L), parent.frame(2L)) - if (is.call(onsub) && onsub[[1L]] == "eval") { onsub = onsub[[2L]] } } if (is.call(onsub) && as.character(onsub[[1L]]) %chin% c("list", ".")) { spat = paste0("[ ]+(", pat, ")[ ]+") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2a7d5d65e..4077654e3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1653,6 +1653,7 @@ test(567, DT[,.N,list(a,b)][,N,by=a]$N, c(1L,1L,2L,1L)) test(568, DT[,.N,list(a,b)][,unique(N),by=a]$V1, c(1L,2L,1L)) test(569, DT[,list(.N=.N),list(a,b)][,.N,a], error="The column '.N' can't be grouped because") test(570, DT[,list(.N=.N),list(a,b)][,unique(.N),a], error="The column '.N' can't be grouped because") +test(570.1, DT[,list(.I=.I),list(a,b)][,.I,a], error="The column '.I' can't be grouped because") # Test spaces in by="..." format, datatable-help on 31 March DT = data.table("a "=1:2, "b"=3:4," b"=5:6, v=1:6) @@ -15001,7 +15002,7 @@ test(2050.6, rbind(DT[1], data.table(f=factor(letters[10:11]))[0])[,levels(f)], test(2051.1, `-.IDate`(structure(0, class="Date"), 1L), structure(-1, class="Date")) test(2051.2, `-.IDate`(1L, 1L), error = 'only subtract from "IDate"') test(2051.3, format.data.table(1L), error = 'Possibly corrupt data.table') -test(2051.4, rleidv(prefix = 1L), error = 'prefix must be NULL or') +test(2051.4, rleidv(prefix = 1L), error = "'prefix' must be NULL or") ## passing Date to second argument of as.POSIXct.ITime t = as.ITime(0L) test(2051.5, as.POSIXct(t, structure(0L, class="Date")), .POSIXct(0, 'UTC')) @@ -15522,6 +15523,52 @@ test(2073.7, transpose(L, make.names=NA), error="make.names=NA is out of range [ test(2073.8, transpose(L, make.names=2), list(A=INT(1,4), B=INT(2,5), C=INT(3,6))) test(2073.9, transpose(L, make.names=2, keep.names='foo'), list(foo=c("a","b"), A=INT(1,4), B=INT(2,5), C=INT(3,6))) +# coverage of data.table.R +## i is NULL +test(2074.01, data.table(1:10)[NULL], data.table(NULL)) +## auto-guessing of byvars when none of the columns have "normal" names +test(2074.02, data.table(`0`=0, `1`=1)[ , TRUE, by = .(`0` + `1`)], data.table(`0`=1, V1=TRUE)) +## also eval.+ columns are OK, just not eval( patterns, #3758 +evaluate = function(x) c('F', 'D', 'C', 'B', 'A')[findInterval(x, c(0, 60, 70, 80, 90, 100))] +test(2074.03, data.table(grade=c(50L, 91L, 95L, 51L, 89L))[ , .N, by=evaluate(grade)], + data.table(evaluate=c('F', 'A', 'B'), N=c(2L, 2L, 1L))) +## error: use recursive character list indexing to assign when also doing alloc.col() +opt = options(datatable.alloccol=1L) +l = list(foo = list(bar = data.table(a = 1:3, b = 4:6))) +test(2074.04, l[[c('foo', 'bar')]][ , (letters) := 16:18], error = 'under-allocated recursively indexed list') +options(opt) +## alloc.col when using 0-truelength j assigning to a subset +DT = data.table(a=1) +### construct incorrectly to have 0 truelength +zDT = structure(list(b=2), class = c('data.table', 'data.frame')) +test(2074.05, DT[1L, b := zDT], data.table(a=1, b=2)) +## nested .SD in j +DT = data.table(a=1, b=2) +test(2074.06, DT[ , c(.SD[1], .SD[1, .SD[1]]), by=a], data.table(a=1, b=2, b=2)) +## as.matrix.data.table when a column has columns (only possible when constructed incorrectly) +DT = structure(list(a=1:5, d=data.table(b=6:10, c=11:15), m=matrix(16:25, ncol=2L)), class = c('data.table', 'data.frame')) +test(2074.07, as.matrix(DT), matrix(1:25, ncol=5L, dimnames=list(NULL, c('a', 'd.b', 'd.c', 'm.1', 'm.2')))) +## can induce !cedta() from base::rownames to get this error +test(2074.08, rownames(structure(list(1:5), class='data.table')), error="Has it been created manually") +## default dimnames.data.table +test(2074.09, dimnames(data.table(a = 1)), list(NULL, 'a')) +## unlock argument of .shallow +DT = data.table(a = 1) +setattr(DT, '.data.table.locked', TRUE) +test(2074.10, attr(.shallow(DT, unlock=TRUE), '.data.table.locked'), NULL) +## coverage of rowidv & rleidv +test(2074.11, rowidv(1:10, cols=1), error="x is a single vector, non-NULL 'cols'") +test(2074.12, rowidv(1:10), rep(1L, 10L)) +test(2074.13, rowidv(list(1:10), cols=integer()), error="x is a list, 'cols' cannot be 0-length") +test(2074.14, rleidv(1:10, cols=1), error="x is a single vector, non-NULL 'cols'") +test(2074.15, rleidv(list(1:10), cols=integer()), error="x is a list, 'cols' cannot be 0-length") +## coverage of .prepareFastSubset +DT = data.table(V1=c('a', 'b', 'a'), V2 = c('hello', 'ello', 'llo'), x=TRUE) +test(2074.16, nrow(DT[!(V1=='a' & V2 %like% 'll')]), 1L) +y = c(TRUE, FALSE, FALSE) +test(2074.17, nrow(DT[x & y]), 1L) +setkey(DT, V1) +test(2074.18, DT[V1=='a', verbose=TRUE], output='Optimized subsetting with key') ################################### # Add new tests above this line #