Skip to content

Commit

Permalink
as.data.table.array method, closes #1418
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki committed Dec 4, 2016
1 parent 6706882 commit 0ecb60b
Show file tree
Hide file tree
Showing 5 changed files with 152 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ S3method("$<-", data.table)
S3method(print, data.table)
S3method(as.data.table, data.table)
S3method(as.data.table, data.frame)
S3method(as.data.table, array)
S3method(as.data.table, matrix)
S3method(as.data.table, list)
S3method(as.data.table, integer)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

1. `indices()` function gain new argument `vectors` default `FALSE`, when `TRUE` provided then list of vectors is returned, single vector refers to single index. Closes #1589.

2. `as.data.table()` gains new method for `array`s, it will now return useful data.table. Closes #1418.

#### BUG FIXES

#### NOTES
Expand Down
31 changes: 31 additions & 0 deletions R/as.data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,37 @@ as.data.table.matrix <- function(x, keep.rownames=FALSE, ...) {
alloc.col(value)
}

# as.data.table.array - #1418
as.data.table.array <- function(x, keep.rownames=FALSE, sorted=TRUE, value.name="value", na.rm=TRUE, ...) {
dx = dim(x)
if (length(dx) <= 2L)
stop("as.data.table.array method should be only called for arrays with 3+ dimensions, for 2 dimensions matrix method should be used")
if (!is.character(value.name) || length(value.name)!=1L || is.na(value.name) || !nzchar(value.name))
stop("Argument 'value.name' must be scalar character, non-NA and non zero char")
if (!is.logical(sorted) || length(sorted)!=1L || is.na(sorted))
stop("Argument 'sorted' must be scalar logical and non-NA")
if (!is.logical(na.rm) || length(na.rm)!=1L || is.na(na.rm))
stop("Argument 'na.rm' must be scalar logical and non-NA")

dnx = dimnames(x)
# NULL dimnames will create integer keys, not character as in table method
val = rev(if (is.null(dnx)) lapply(dim(x), seq.int) else dnx)
if (is.null(names(val)) || all(!nzchar(names(val))))
setattr(val, 'names', paste("V", rev(seq_along(val)), sep=""))
if (value.name %in% names(val))
stop(sprintf("Argument 'value.name' should not overlap with column names in result: %s.", paste(rev(names(val)), collapse=", ")))
N = NULL
ans = data.table(do.call(CJ, c(val, sorted=FALSE)), N=as.vector(x))
if (isTRUE(na.rm))
ans = ans[!is.na(N)]
setnames(ans, "N", value.name)
dims = rev(head(names(ans), -1))
setcolorder(ans, c(dims, value.name))
if (isTRUE(sorted))
setkeyv(ans, dims)
ans[]
}

as.data.table.list <- function(x, keep.rownames=FALSE, ...) {
if (!length(x)) return( null.data.table() )
# fix for #833, as.data.table.list with matrix/data.frame/data.table as a list element..
Expand Down
107 changes: 107 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -9789,6 +9789,113 @@ indices(DT, vectors = TRUE)
test(1749.1, indices(DT), c("A__B","A","B"))
test(1749.2, indices(DT, vectors = TRUE), list(c("A","B"),"A","B"))

set.seed(1L)
ar.dimnames = list(color = sort(c("green","yellow","red")),
year = as.character(2011:2015),
status = sort(c("active","inactive","archived","removed")))
ar.dim = sapply(ar.dimnames, length)
ar = array(sample(c(rep(NA, 4), 4:7/2), prod(ar.dim), TRUE),
unname(ar.dim), # array() having length(dims) < 3 will be created as matrix in R so will not be dispatched here but as.data.table.matrix
ar.dimnames)
dt = as.data.table(ar, na.rm=FALSE)
dimcols = head(names(dt), -1L)
test(1750.1, TRUE, all(
nrow(dt) == 60L,
prod(sapply(ar.dimnames, length)) == dt[, prod(sapply(.SD, uniqueN)), .SDcols = dimcols],
dt[is.na(value), .N] == 30L,
dt[, .N==1L, c(dimcols)]$V1
))
dt = as.data.table(ar)
dimcols = head(names(dt), -1L)
test(1750.2, TRUE, all(
nrow(dt) == 30L,
prod(sapply(ar.dimnames, length)) == dt[, prod(sapply(.SD, uniqueN)), .SDcols = dimcols],
dt[is.na(value), .N] == 0L,
dt[, .N==1L, c(dimcols)]$V1
))
# 4D unnamed
x = array(1:81, dim=rep(3L,4))
dt = as.data.table(x, na.rm=FALSE)
test(1750.3, TRUE, all(
identical(dim(dt), c(81L,5L)),
identical(names(dt), c(paste0("V",1:4),"value")),
all(dt[J(1L)][1L, value] == 1L, dt[J(2L)][1L, value] == 2L, dt[J(3L)][.N, value] == 81L) # this also tests if dt is keyed
))
# 4D named dim values but not dims
x = array(1:81, dim=rep(3L, 4L), dimnames=rep(list(letters[1:3]), 4L))
dt = as.data.table(x, na.rm=FALSE)
test(1750.4, TRUE, all(
identical(dim(dt), c(81L,5L)),
identical(names(dt), c(paste0("V",1:4),"value")),
all(dt[J("a")][1L, value] == 1L, dt[J("b")][1L, value] == 2L, dt[J("c")][.N, value] == 81L)
))
# 4D named dim values and dims
x = array(1:81, dim=rep(3L, 4L), dimnames=setNames(rep(list(letters[1:3]), 4L), letters[1:4]))
dt = as.data.table(x, na.rm=FALSE)
test(1750.5, TRUE, all(
identical(dim(dt), c(81L,5L)),
identical(names(dt), c(letters[1:4],"value")),
all(dt[J("a")][1L, value] == 1L, dt[J("b")][1L, value] == 2L, dt[J("c")][.N, value] == 81L)
))
# third dim of length 1L so really 2D
x = array(1:4, dim=c(2L,2L,1L), dimnames=list(a=letters[1:2], b=letters[1:2], c="a"))
dt = as.data.table(x, na.rm=FALSE)
test(1750.6, TRUE, all(
identical(dim(dt), c(4L,4L)),
identical(names(dt), c("a","b","c","value")),
all(dt[J("a")][, value] == c(1L,3L), dt[J("b")][, value] == c(2L,4L))
))
# second and third dim of length 1L so really 1D
x = array(1:2, dim=c(2L,1L,1L), dimnames=list(a=letters[1:2], b="a", c="a"))
dt = as.data.table(x, na.rm=FALSE)
test(1750.7, TRUE, all(
identical(dim(dt), c(2L,4L)),
identical(names(dt), c("a","b","c","value")),
all(dt[J("a")][, value] == 1L, dt[J("b")][, value] == 2L)
))
# 3x3x3 na.rm=FALSE / sorted=TRUE
set.seed(2)
x = rnorm(27)
x[sample(length(x), length(x)/2)] = NA
dim(x) = c(3L,3L,3L)
dt = as.data.table(x, na.rm=FALSE)
test(1750.8, TRUE, all(
identical(dim(dt), c(27L,4L)),
identical(names(dt), c(paste0("V",1:3),"value")),
dt[is.na(value), .N] > 0L,
length(key(dt)) > 0L
))
# na.rm=TRUE / sorted=TRUE
dt = as.data.table(x)
test(1750.9, TRUE, all(
identical(dim(dt), c(14L,4L)),
identical(names(dt), c(paste0("V",1:3),"value")),
dt[is.na(value), .N] == 0L,
length(key(dt)) > 0L
))
# na.rm=TRUE / sorted=FALSE
dt = as.data.table(x, sorted=FALSE)
test(1750.10, TRUE, all(
identical(dim(dt), c(14L,4L)),
identical(names(dt), c(paste0("V",1:3),"value")),
dt[is.na(value), .N] == 0L,
is.unsorted(dt[[1]]),
is.null(key(dt))
))
# na.rm=FALSE / sorted=FALSE
dt = as.data.table(x, na.rm=FALSE, sorted=FALSE)
test(1750.11, TRUE, all(
identical(dim(dt), c(27L,4L)),
identical(names(dt), c(paste0("V",1:3),"value")),
is.unsorted(dt[[1]]),
is.null(key(dt))
))
# expects error on value.name overlapping with column names in result (dimension names)
x = array(1:2, dim=c(2L,1L,1L), dimnames=list(a=letters[1:2], b="a", c="a"))
test(1750.12, as.data.table(x, value.name="a"), error = "Argument 'value.name' should not overlap with column names in result")
x = array(1:2, dim=c(2L,1L,1L), dimnames=list(a=letters[1:2], b="a", value="a"))
test(1750.13, as.data.table(x), error = "Argument 'value.name' should not overlap with column names in result")


##########################

Expand Down
11 changes: 11 additions & 0 deletions man/as.data.table.Rd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
\name{as.data.table}
\alias{as.data.table}
\alias{as.data.table.array}
\alias{as.data.table.matrix}
\alias{as.data.table.list}
\alias{as.data.table.data.frame}
Expand All @@ -22,12 +23,17 @@ as.data.table(x, keep.rownames=FALSE, \dots)

\method{as.data.table}{data.table}(x, \dots)

\method{as.data.table}{array}(x, keep.rownames=FALSE, sorted=TRUE, value.name="value", na.rm=TRUE, \dots)

is.data.table(x)

}
\arguments{
\item{x}{An R object.}
\item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead.}
\item{sorted}{logical used in \emph{array} method, default \code{TRUE}.}
\item{value.name}{character scalar used in \emph{array} method, default \code{"value"}.}
\item{na.rm}{logical used in \emph{array} method, default \code{TRUE} will remove rows with \code{NA} values.}
\item{\dots}{Additional arguments to be passed to or from other methods.}
}
\details{
Expand Down Expand Up @@ -74,6 +80,11 @@ as.data.table(df, keep.rownames="rownames")
dt = data.table(x=rep(c("x","y","z"),each=2), y=c(1:6))
as.data.table(dt)
ar = rnorm(27)
ar[sample(27, 15)] = NA
dim(ar) = c(3L,3L,3L)
as.data.table(ar)
}
\keyword{ data }

0 comments on commit 0ecb60b

Please sign in to comment.