diff --git a/.Rbuildignore b/.Rbuildignore index 17a128d24..2d304c068 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -21,3 +21,4 @@ ^bus$ ^Dockerfile$ ^Dockerfile\.in$ +^.*\.dll$ diff --git a/.gitignore b/.gitignore index ce2767d71..b3c264606 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ vignettes/*.pdf # object and shared objects *.o *.so +*.dll *~ .DS_Store diff --git a/NEWS.md b/NEWS.md index 7ebd05548..c9f997b63 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,3 @@ - **If you are viewing this file on CRAN, please check [latest news on GitHub](https://github.com/Rdatatable/data.table/blob/master/NEWS.md) where the formatting is also better.** ### Changes in v1.11.9 (to be v1.12.0) @@ -9,6 +8,8 @@ 2. `nomatch=NULL` now does the same as `nomatch=0L`; i.e. discards missing values silently (inner join). The default is still `nomatch=NA` (outer join) for statistical safety so that missing values are retained by default. You have to explicitly write `nomatch=NULL` to indicate to the reader of your code that you intend to discard missing values silently. After several years have elapsed, we will start to deprecate `0L`; please start using `NULL`. TO DO ... `nomatch=.(0)` fills with `0` instead of `NA`, [#857](https://github.com/Rdatatable/data.table/issues/857) and `nomatch="error"`. +3. `setnames()` gains `skip_absent` to skip names in `old` that aren't present, [#3030](https://github.com/Rdatatable/data.table/issues/3030). By default `FALSE` so that it is still an error, as before, to attempt to change a column name that is not present. Thanks to @MusTheDataGuy for the suggestion and the PR. + #### BUG FIXES 1. Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting. 
@@ -542,5 +543,3 @@ When `j` is a symbol (as in the quanteda and xgboost examples above) it will con ### Old news from v1.9.8 (Nov 2016) back to v1.2 (Aug 2008) has been moved to [NEWS.0.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.0.md) - - diff --git a/R/data.table.R b/R/data.table.R index d4e7f968b..e0b1a12d3 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1,4 +1,3 @@ - if (!exists("trimws", "package:base")) { # trimws was new in R 3.2.0. Backport it for internal data.table use in R 3.1.0 trimws <- function(x) { @@ -2506,12 +2505,13 @@ setattr <- function(x,name,value) { invisible(x) } -setnames <- function(x,old,new) { +setnames <- function(x,old,new,skip_absent=FALSE) { # Sets by reference, maintains truelength, no copy of table at all. # But also more convenient than names(DT)[i]="newname" because we can also do setnames(DT,"oldname","newname") # without an onerous match() ourselves. old can be positions, too, but we encourage by name for robustness. if (!is.data.frame(x)) stop("x is not a data.table or data.frame") if (length(names(x)) != length(x)) stop("dt is length ",length(dt)," but its names are length ",length(names(x))) + stopifnot(isTRUE(skip_absent) || identical(skip_absent,FALSE)) if (missing(new)) { # for setnames(DT,new); e.g., setnames(DT,c("A","B")) where ncol(DT)==2 if (!is.character(old)) stop("Passed a vector of type '",typeof(old),"'. 
Needs to be type 'character'.") @@ -2541,7 +2541,16 @@ setnames <- function(x,old,new) { if (!is.character(old)) stop("'old' is type ",typeof(old)," but should be integer, double or character") if (any(duplicated(old))) stop("Some duplicates exist in 'old': ", paste(old[duplicated(old)],collapse=",")) i = chmatch(old,names(x)) - if (anyNA(i)) stop("Items of 'old' not found in column names: ",paste(old[is.na(i)],collapse=",")) + if (anyNA(i)) { + if (isTRUE(skip_absent)) { + w <- old %chin% names(x) + old = old[w] + new = new[w] + i = i[w] + } else { + stop("Items of 'old' not found in column names: ",paste(old[is.na(i)],collapse=","), ". Consider skip_absent=TRUE.") + } + } if (any(tt<-!is.na(chmatch(old,names(x)[-i])))) stop("Some items of 'old' are duplicated (ambiguous) in column names: ",paste(old[tt],collapse=",")) } if (length(new)!=length(i)) stop("'old' is length ",length(i)," but 'new' is length ",length(new)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 08858e7e5..b427e022d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12354,6 +12354,18 @@ test(1953.4, melt.data.table(DT, id.vars = 'id', measure.vars = 'a'), DT = data.table(A=INT(1,3,2,3,2), B=1:5) # respect groups in 1st column (3's and 2's) test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L)) +# skip values that are not present in old, #3030 +DT <- data.table(a=1, b=2, d=3) +old <- c("a", "b", "c", "d") +new <- c("A", "B", "C", "D") +test(1955.1, setnames(DT, old, new, skip_absent=TRUE), data.table(A=1, B=2, D=3)) +test(1955.2, setnames(DT, old, new, skip_absent=0), error="is not") # must be TRUE or FALSE +test(1955.3, setnames(DT, "missing", "dummy", skip_absent=TRUE), DT) # all missing +test(1955.4, setnames(DT, c("D","missing","A"), c("dd","ignored","aa"), skip_absent=TRUE), data.table(aa=1, B=2, dd=3)) # different order with a missing +test(1955.5, setnames(DT, "B", "bb", skip_absent=TRUE), data.table(aa=1, bb=2, 
dd=3)) # none missing so skip_absent not needed +test(1955.6, setnames(DT, c("miss1","bb","miss2","dd"), c("A","B","C","D")), error="Items of 'old' not found in column names: miss1,miss2. Consider skip_absent=TRUE") +test(1955.7, setnames(DT, c("miss1","bb","miss2","dd"), c("A","B","C","D"), skip_absent=TRUE), data.table(aa=1, B=2, D=3)) + ################################### # Add new tests above this line # diff --git a/man/setattr.Rd b/man/setattr.Rd index 90d72d440..cb9f69e2e 100644 --- a/man/setattr.Rd +++ b/man/setattr.Rd @@ -7,7 +7,7 @@ } \usage{ setattr(x,name,value) -setnames(x,old,new) +setnames(x,old,new,skip_absent=FALSE) } \arguments{ \item{x}{ \code{setnames} accepts \code{data.frame} and \code{data.table}. \code{setattr} accepts any input; e.g, list, columns of a \code{data.frame} or \code{data.table}. } @@ -15,7 +15,9 @@ setnames(x,old,new) \item{value}{ The value to assign to the attribute or \code{NULL} removes the attribute, if present. } \item{old}{ When \code{new} is provided, character names or numeric positions of column names to change. When \code{new} is not provided, the new column names, which must be the same length as the number of columns. See examples. } \item{new}{ Optional. New column names, must be the same length as columns provided to \code{old} argument. } + \item{skip_absent}{ Skip items in \code{old} that are missing (i.e. absent) in \code{names(x)}. Default \code{FALSE} halts with error if any are missing. } } + \details{ \code{setnames} operates on \code{data.table} and \code{data.frame} not other types like \code{list} and \code{vector}. It can be used to change names \emph{by name} with built-in checks and warnings (e.g., if any old names are missing or appear more than once).
@@ -34,6 +36,13 @@ setnames(x,old,new) } \examples{ +DT <- data.table(a = 1, b = 2, d = 3) + +old <- c("a", "b", "c", "d") +new <- c("A", "B", "C", "D") + +setnames(DT, old, new, skip_absent = TRUE) # skips `old[3]` because `"c"` is not a column name of `DT` + DF = data.frame(a=1:2,b=3:4) # base data.frame to demo copies and syntax if (capabilities()["profmem"]) # usually memory profiling is available but just in case tracemem(DF) @@ -70,4 +79,3 @@ attr(DT,"myFlag2") # NULL } \keyword{ data } -