as.data.table.array method, closes #1418

Rdatatable · Dec 4, 2016 · 0ecb60b · 0ecb60b
1 parent 6706882
commit 0ecb60b
Show file tree

Hide file tree

Showing 5 changed files with 152 additions and 0 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -49,6 +49,7 @@ S3method("$<-", data.table)
 S3method(print, data.table)
 S3method(as.data.table, data.table)
 S3method(as.data.table, data.frame)
+S3method(as.data.table, array)
 S3method(as.data.table, matrix)
 S3method(as.data.table, list)
 S3method(as.data.table, integer)

diff --git a/NEWS.md b/NEWS.md
@@ -5,6 +5,8 @@
 
 1. `indices()` function gain new argument `vectors` default `FALSE`, when `TRUE` provided then list of vectors is returned, single vector refers to single index. Closes #1589.
 
+2. `as.data.table()` gains new method for `array`s, it will now return useful data.table. Closes #1418.
+
 #### BUG FIXES
 
 #### NOTES

diff --git a/R/as.data.table.R b/R/as.data.table.R
@@ -93,6 +93,37 @@ as.data.table.matrix <- function(x, keep.rownames=FALSE, ...) {
     alloc.col(value)
 }
 
+# as.data.table.array - #1418
+as.data.table.array <- function(x, keep.rownames=FALSE, sorted=TRUE, value.name="value", na.rm=TRUE, ...) {
+    dx = dim(x)
+    if (length(dx) <= 2L)
+        stop("as.data.table.array method should be only called for arrays with 3+ dimensions, for 2 dimensions matrix method should be used")
+    if (!is.character(value.name) || length(value.name)!=1L || is.na(value.name) || !nzchar(value.name))
+        stop("Argument 'value.name' must be scalar character, non-NA and non zero char")
+    if (!is.logical(sorted) || length(sorted)!=1L || is.na(sorted))
+        stop("Argument 'sorted' must be scalar logical and non-NA")
+    if (!is.logical(na.rm) || length(na.rm)!=1L || is.na(na.rm))
+        stop("Argument 'na.rm' must be scalar logical and non-NA")
+
+    dnx = dimnames(x)
+    # NULL dimnames will create integer keys, not character as in table method
+    val = rev(if (is.null(dnx)) lapply(dim(x), seq.int) else dnx)
+    if (is.null(names(val)) || all(!nzchar(names(val))))
+        setattr(val, 'names', paste("V", rev(seq_along(val)), sep=""))
+    if (value.name %in% names(val))
+        stop(sprintf("Argument 'value.name' should not overlap with column names in result: %s.", paste(rev(names(val)), collapse=", ")))
+    N = NULL
+    ans = data.table(do.call(CJ, c(val, sorted=FALSE)), N=as.vector(x))
+    if (isTRUE(na.rm))
+        ans = ans[!is.na(N)]
+    setnames(ans, "N", value.name)
+    dims = rev(head(names(ans), -1))
+    setcolorder(ans, c(dims, value.name))
+    if (isTRUE(sorted))
+        setkeyv(ans, dims)
+    ans[]
+}
+
 as.data.table.list <- function(x, keep.rownames=FALSE, ...) {
     if (!length(x)) return( null.data.table() )
     # fix for #833, as.data.table.list with matrix/data.frame/data.table as a list element..

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -9789,6 +9789,113 @@ indices(DT, vectors = TRUE)
 test(1749.1, indices(DT), c("A__B","A","B"))
 test(1749.2, indices(DT, vectors = TRUE), list(c("A","B"),"A","B"))
 
+set.seed(1L)
+ar.dimnames = list(color = sort(c("green","yellow","red")), 
+                   year = as.character(2011:2015), 
+                   status = sort(c("active","inactive","archived","removed")))
+ar.dim = sapply(ar.dimnames, length)
+ar = array(sample(c(rep(NA, 4), 4:7/2), prod(ar.dim), TRUE),
+           unname(ar.dim), # array() having length(dims) < 3 will be created as matrix in R so will not be dispatched here but as.data.table.matrix
+           ar.dimnames)
+dt = as.data.table(ar, na.rm=FALSE)
+dimcols = head(names(dt), -1L)
+test(1750.1, TRUE, all(
+    nrow(dt) == 60L,
+    prod(sapply(ar.dimnames, length)) == dt[, prod(sapply(.SD, uniqueN)), .SDcols = dimcols],
+    dt[is.na(value), .N] == 30L,
+    dt[, .N==1L, c(dimcols)]$V1
+))
+dt = as.data.table(ar)
+dimcols = head(names(dt), -1L)
+test(1750.2, TRUE, all(
+    nrow(dt) == 30L,
+    prod(sapply(ar.dimnames, length)) == dt[, prod(sapply(.SD, uniqueN)), .SDcols = dimcols],
+    dt[is.na(value), .N] == 0L,
+    dt[, .N==1L, c(dimcols)]$V1
+))
+# 4D unnamed
+x = array(1:81, dim=rep(3L,4))
+dt = as.data.table(x, na.rm=FALSE)
+test(1750.3, TRUE, all(
+    identical(dim(dt), c(81L,5L)),
+    identical(names(dt), c(paste0("V",1:4),"value")),
+    all(dt[J(1L)][1L, value] == 1L, dt[J(2L)][1L, value] == 2L, dt[J(3L)][.N, value] == 81L) # this also tests if dt is keyed
+))
+# 4D named dim values but not dims
+x = array(1:81, dim=rep(3L, 4L), dimnames=rep(list(letters[1:3]), 4L))
+dt = as.data.table(x, na.rm=FALSE)
+test(1750.4, TRUE, all(
+    identical(dim(dt), c(81L,5L)),
+    identical(names(dt), c(paste0("V",1:4),"value")),
+    all(dt[J("a")][1L, value] == 1L, dt[J("b")][1L, value] == 2L, dt[J("c")][.N, value] == 81L)
+))
+# 4D named dim values and dims
+x = array(1:81, dim=rep(3L, 4L), dimnames=setNames(rep(list(letters[1:3]), 4L), letters[1:4]))
+dt = as.data.table(x, na.rm=FALSE)
+test(1750.5, TRUE, all(
+    identical(dim(dt), c(81L,5L)),
+    identical(names(dt), c(letters[1:4],"value")),
+    all(dt[J("a")][1L, value] == 1L, dt[J("b")][1L, value] == 2L, dt[J("c")][.N, value] == 81L)
+))
+# third dim of length 1L so really 2D
+x = array(1:4, dim=c(2L,2L,1L), dimnames=list(a=letters[1:2], b=letters[1:2], c="a"))
+dt = as.data.table(x, na.rm=FALSE)
+test(1750.6, TRUE, all(
+    identical(dim(dt), c(4L,4L)),
+    identical(names(dt), c("a","b","c","value")),
+    all(dt[J("a")][, value] == c(1L,3L), dt[J("b")][, value] == c(2L,4L))
+))
+# second and third dim of length 1L so really 1D
+x = array(1:2, dim=c(2L,1L,1L), dimnames=list(a=letters[1:2], b="a", c="a"))
+dt = as.data.table(x, na.rm=FALSE)
+test(1750.7, TRUE, all(
+    identical(dim(dt), c(2L,4L)),
+    identical(names(dt), c("a","b","c","value")),
+    all(dt[J("a")][, value] == 1L, dt[J("b")][, value] == 2L)
+))
+# 3x3x3 na.rm=FALSE / sorted=TRUE
+set.seed(2)
+x = rnorm(27)
+x[sample(length(x), length(x)/2)] = NA
+dim(x) = c(3L,3L,3L)
+dt = as.data.table(x, na.rm=FALSE)
+test(1750.8, TRUE, all(
+    identical(dim(dt), c(27L,4L)),
+    identical(names(dt), c(paste0("V",1:3),"value")),
+    dt[is.na(value), .N] > 0L,
+    length(key(dt)) > 0L
+))
+# na.rm=TRUE / sorted=TRUE
+dt = as.data.table(x)
+test(1750.9, TRUE, all(
+    identical(dim(dt), c(14L,4L)),
+    identical(names(dt), c(paste0("V",1:3),"value")),
+    dt[is.na(value), .N] == 0L,
+    length(key(dt)) > 0L
+))
+# na.rm=TRUE / sorted=FALSE
+dt = as.data.table(x, sorted=FALSE)
+test(1750.10, TRUE, all(
+    identical(dim(dt), c(14L,4L)),
+    identical(names(dt), c(paste0("V",1:3),"value")),
+    dt[is.na(value), .N] == 0L,
+    is.unsorted(dt[[1]]),
+    is.null(key(dt))
+))
+# na.rm=FALSE / sorted=FALSE
+dt = as.data.table(x, na.rm=FALSE, sorted=FALSE)
+test(1750.11, TRUE, all(
+    identical(dim(dt), c(27L,4L)),
+    identical(names(dt), c(paste0("V",1:3),"value")),
+    is.unsorted(dt[[1]]),
+    is.null(key(dt))
+))
+# expects error on value.name overlapping with column names in result (dimension names)
+x = array(1:2, dim=c(2L,1L,1L), dimnames=list(a=letters[1:2], b="a", c="a"))
+test(1750.12, as.data.table(x, value.name="a"), error = "Argument 'value.name' should not overlap with column names in result")
+x = array(1:2, dim=c(2L,1L,1L), dimnames=list(a=letters[1:2], b="a", value="a"))
+test(1750.13, as.data.table(x), error = "Argument 'value.name' should not overlap with column names in result")
+
 
 ##########################
 

diff --git a/man/as.data.table.Rd b/man/as.data.table.Rd
@@ -1,5 +1,6 @@
 \name{as.data.table}
 \alias{as.data.table}
+\alias{as.data.table.array}
 \alias{as.data.table.matrix}
 \alias{as.data.table.list}
 \alias{as.data.table.data.frame}
@@ -22,12 +23,17 @@ as.data.table(x, keep.rownames=FALSE, \dots)
 
 \method{as.data.table}{data.table}(x, \dots)
 
+\method{as.data.table}{array}(x, keep.rownames=FALSE, sorted=TRUE, value.name="value", na.rm=TRUE, \dots)
+
 is.data.table(x)
 
 }
 \arguments{
   \item{x}{An R object.}
   \item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead.}
+  \item{sorted}{logical used in \emph{array} method, default \code{TRUE}.}
+  \item{value.name}{character scalar used in \emph{array} method, default \code{"value"}.}
+  \item{na.rm}{logical used in \emph{array} method, default \code{TRUE} will remove rows with \code{NA} values.}
   \item{\dots}{Additional arguments to be passed to or from other methods.}
 }
 \details{
@@ -74,6 +80,11 @@ as.data.table(df, keep.rownames="rownames")
 
 dt = data.table(x=rep(c("x","y","z"),each=2), y=c(1:6))
 as.data.table(dt)
+
+ar = rnorm(27)
+ar[sample(27, 15)] = NA
+dim(ar) = c(3L,3L,3L)
+as.data.table(ar)
 }
 \keyword{ data }