From 9d3109771f6a8caedc0643d1bd795c79722e8e8b Mon Sep 17 00:00:00 2001 From: HughParsonage Date: Thu, 1 Mar 2018 12:05:39 +1100 Subject: [PATCH 1/8] Faster uniqueN(v) for logical v --- NEWS.md | 6 ++++-- R/duplicated.R | 13 ++++++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 29cfad750..d4ec6c713 100644 --- a/NEWS.md +++ b/NEWS.md @@ -65,10 +65,12 @@ 13. `unique(DT)` now returns `DT` early when there are no duplicates to save RAM, [#2013](https://github.com/Rdatatable/data.table/issues/2013). Thanks to Michael Chirico for the PR. -14. Subsetting optimization with keys and indices is now possible for compound queries like `DT[a==1 & b==2]`, [#2472](https://github.com/Rdatatable/data.table/issues/2472). +14. `uniqueN` is now faster on logical vectors. Thanks to Hugh Parsonage for the PR. + +15. Subsetting optimization with keys and indices is now possible for compound queries like `DT[a==1 & b==2]`, [#2472](https://github.com/Rdatatable/data.table/issues/2472). Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementation. -15. `melt.data.table` now offers friendlier functionality for providing `value.name` for `list` input to `measure.vars`, [#1547](https://github.com/Rdatatable/data.table/issues/1547). Thanks @MichaelChirico and @franknarf1 for the suggestion and use cases, @jangorecki and @mrdwab for implementation feedback, and @MichaelChirico for ultimate implementation. +16. `melt.data.table` now offers friendlier functionality for providing `value.name` for `list` input to `measure.vars`, [#1547](https://github.com/Rdatatable/data.table/issues/1547). Thanks @MichaelChirico and @franknarf1 for the suggestion and use cases, @jangorecki and @mrdwab for implementation feedback, and @MichaelChirico for ultimate implementation. #### BUG FIXES diff --git a/R/duplicated.R b/R/duplicated.R index 1f182870d..85da378ce 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -142,7 +142,18 @@ uniqueN <- function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE) if (is.null(x)) return(0L) if (!is.atomic(x) && !is.data.frame(x)) stop("x must be an atomic vector or data.frames/data.tables") - if (is.atomic(x)) x = as_list(x) + if (is.atomic(x)) { + if (is.logical(x)) { + # NAs + TRUE + FALSE + if (na.rm) { + return(any(x, na.rm = TRUE) + !all(x, na.rm = TRUE)) + } else { + return(anyNA(x) + any(x, na.rm = TRUE) + !all(x, na.rm = TRUE)) + } + } else { + x = as_list(x) + } + } if (is.null(by)) by = seq_along(x) o = forderv(x, by=by, retGrp=TRUE, na.last=if (!na.rm) FALSE else NA) starts = attr(o, 'starts') From a324fb22c9f88013c2c9765eb1c3d39a7d9eccce Mon Sep 17 00:00:00 2001 From: HughParsonage Date: Thu, 1 Mar 2018 12:47:44 +1100 Subject: [PATCH 2/8] Add tests for coverage --- inst/tests/tests.Rraw | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 481a44492..7a7acb459 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6486,6 +6486,13 @@ DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), C = rep(1:2, 6)) test(1475.1, uniqueN(DT), 10L) test(1475.2, DT[, .(uN=uniqueN(.SD)), by=A], data.table(A=1:3, uN=c(3L,4L,3L))) +# uniqueN logical branch +test(1475.3, x = uniqueN(c(NA, TRUE, FALSE)), y = 3L) +test(1475.4, x = uniqueN(c(NA, TRUE, FALSE), na.rm = TRUE), y = 2L) +test(1475.5, x = uniqueN(c(TRUE, FALSE), na.rm = TRUE), y = 2L) +test(1475.6, x = uniqueN(c(TRUE, FALSE)), y = 2L) + + # preserve class attribute in GForce mean (and sum) DT <- data.table(x = rep(1:3, each = 3), y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day"))) test(1476.1, DT[, .(y=mean(y)), x], setDT(aggregate(y ~ x, DT, mean))) From f51c5ca83da0e7f23f63226f47b646f0c6743b51 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 1 Mar 2018 01:15:57 -0800 Subject: [PATCH 3/8] Specialized uniqueNlogical in C. Single pass, returns early. --- R/duplicated.R | 12 ++---------- inst/tests/tests.Rraw | 21 +++++++++++++++------ src/data.table.h | 1 + src/init.c | 2 ++ src/uniqlist.c | 32 ++++++++++++++++++++++++++++++++ 5 files changed, 52 insertions(+), 16 deletions(-) diff --git a/R/duplicated.R b/R/duplicated.R index 85da378ce..6e56b5aba 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -143,16 +143,8 @@ uniqueN <- function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE) if (!is.atomic(x) && !is.data.frame(x)) stop("x must be an atomic vector or data.frames/data.tables") if (is.atomic(x)) { - if (is.logical(x)) { - # NAs + TRUE + FALSE - if (na.rm) { - return(any(x, na.rm = TRUE) + !all(x, na.rm = TRUE)) - } else { - return(anyNA(x) + any(x, na.rm = TRUE) + !all(x, na.rm = TRUE)) - } - } else { - x = as_list(x) - } + if (is.logical(x)) return(.Call(CuniqueNlogical, x, na.rm=na.rm)) + x = as_list(x) } if (is.null(by)) by = seq_along(x) o = forderv(x, by=by, retGrp=TRUE, na.last=if (!na.rm) FALSE else NA) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 38c91d36d..0325874d4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6486,12 +6486,21 @@ DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), C = rep(1:2, 6)) test(1475.1, uniqueN(DT), 10L) test(1475.2, DT[, .(uN=uniqueN(.SD)), by=A], data.table(A=1:3, uN=c(3L,4L,3L))) -# uniqueN logical branch -test(1475.3, x = uniqueN(c(NA, TRUE, FALSE)), y = 3L) -test(1475.4, x = uniqueN(c(NA, TRUE, FALSE), na.rm = TRUE), y = 2L) -test(1475.5, x = uniqueN(c(TRUE, FALSE), na.rm = TRUE), y = 2L) -test(1475.6, x = uniqueN(c(TRUE, FALSE)), y = 2L) - +# specialized uniqueN for logical vectors, PR#2648 +test(1475.3, uniqueN(c(NA, TRUE, FALSE)), 3L) +test(1475.4, uniqueN(c(NA, TRUE, FALSE), na.rm = TRUE), 2L) +test(1475.5, uniqueN(c(TRUE, FALSE), na.rm = TRUE), 2L) +test(1475.6, uniqueN(c(TRUE, FALSE)), 2L) +test(1475.7, uniqueN(c(TRUE, NA)), 2L) +test(1475.8, uniqueN(c(TRUE, NA), na.rm=TRUE), 1L) +test(1475.9, uniqueN(c(FALSE, NA)), 2L) +test(1475.11, uniqueN(c(FALSE, NA), na.rm=TRUE), 1L) +test(1475.12, uniqueN(c(NA,NA)), 1L) +test(1475.13, uniqueN(c(NA,NA), na.rm=TRUE), 0L) +test(1475.14, uniqueN(NA), 1L) +test(1475.15, uniqueN(NA, na.rm=TRUE), 0L) +test(1475.16, uniqueN(logical()), 0L) +test(1475.17, uniqueN(logical(), na.rm=TRUE), 0L) # preserve class attribute in GForce mean (and sum) DT <- data.table(x = rep(1:3, each = 3), y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day"))) diff --git a/src/data.table.h b/src/data.table.h index 1779045b9..d10cdd332 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -4,6 +4,7 @@ // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); #include // for uint64_t rather than unsigned long long +#include #include "myomp.h" // data.table depends on R>=3.0.0 when R_xlen_t was introduced diff --git a/src/init.c b/src/init.c index 9f67ff3b7..9b5c5aa27 100644 --- a/src/init.c +++ b/src/init.c @@ -76,6 +76,7 @@ SEXP fsort(); SEXP inrange(); SEXP between(); SEXP hasOpenMP(); +SEXP uniqueNlogical(); // .Externals SEXP fastmean(); @@ -154,6 +155,7 @@ R_CallMethodDef callMethods[] = { {"Cinrange", (DL_FUNC) &inrange, -1}, {"Cbetween", (DL_FUNC) &between, -1}, {"ChasOpenMP", (DL_FUNC) &hasOpenMP, -1}, +{"CuniqueNlogical", (DL_FUNC) &uniqueNlogical, -1}, {NULL, NULL, 0} }; diff --git a/src/uniqlist.c b/src/uniqlist.c index 3958be99f..afdd6e37a 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -228,3 +228,35 @@ SEXP nestedid(SEXP l, SEXP cols, SEXP order, SEXP grps, SEXP resetvals, SEXP mul UNPROTECT(1); return(ans); } + +SEXP uniqueNlogical(SEXP x, SEXP narmArg) { + // single pass; short-circuit and return as soon as all 3 values are found + if (!isLogical(x)) error("x is not a logical vector"); + if (!isLogical(narmArg) || length(narmArg)!=1 || INTEGER(narmArg)[0]==NA_INTEGER) error("na.rm must be TRUE or FALSE"); + bool narm = LOGICAL(narmArg)[0]==1; + int n = LENGTH(x); + if (n==0) + return( ScalarInteger(0) ); // empty vector + Rboolean first = INTEGER(x)[0]; + int i=1; + while (i Date: Thu, 1 Mar 2018 11:24:43 -0800 Subject: [PATCH 4/8] Simpler uniqueNlogical. Should be zero speed difference in any case, just simpler code. --- src/uniqlist.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/uniqlist.c b/src/uniqlist.c index afdd6e37a..c047ee0d7 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -236,27 +236,19 @@ SEXP uniqueNlogical(SEXP x, SEXP narmArg) { bool narm = LOGICAL(narmArg)[0]==1; int n = LENGTH(x); if (n==0) - return( ScalarInteger(0) ); // empty vector + return ScalarInteger(0); // empty vector Rboolean first = INTEGER(x)[0]; int i=1; while (i Date: Thu, 1 Mar 2018 11:54:41 -0800 Subject: [PATCH 5/8] Remove one single superfluous comparison: when the loop looking for third started. No speed difference, just tidying. --- src/uniqlist.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/uniqlist.c b/src/uniqlist.c index c047ee0d7..2a208c008 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -238,8 +238,8 @@ SEXP uniqueNlogical(SEXP x, SEXP narmArg) { if (n==0) return ScalarInteger(0); // empty vector Rboolean first = INTEGER(x)[0]; - int i=1; - while (i Date: Thu, 1 Mar 2018 17:18:00 -0800 Subject: [PATCH 6/8] Attempt to help compiler with const and == --- src/uniqlist.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/uniqlist.c b/src/uniqlist.c index 2a208c008..1179d9e88 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -234,21 +234,22 @@ SEXP uniqueNlogical(SEXP x, SEXP narmArg) { if (!isLogical(x)) error("x is not a logical vector"); if (!isLogical(narmArg) || length(narmArg)!=1 || INTEGER(narmArg)[0]==NA_INTEGER) error("na.rm must be TRUE or FALSE"); bool narm = LOGICAL(narmArg)[0]==1; - int n = LENGTH(x); + const int n = LENGTH(x); if (n==0) return ScalarInteger(0); // empty vector - Rboolean first = INTEGER(x)[0]; + Rboolean first = LOGICAL(x)[0]; int i=0; - while (++i Date: Thu, 1 Mar 2018 18:55:00 -0800 Subject: [PATCH 7/8] Long vector support. Maybe using R_xlen_t just like base R helps with alignment. --- src/uniqlist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uniqlist.c b/src/uniqlist.c index 1179d9e88..478e04776 100644 --- a/src/uniqlist.c +++ b/src/uniqlist.c @@ -234,11 +234,11 @@ SEXP uniqueNlogical(SEXP x, SEXP narmArg) { if (!isLogical(x)) error("x is not a logical vector"); if (!isLogical(narmArg) || length(narmArg)!=1 || INTEGER(narmArg)[0]==NA_INTEGER) error("na.rm must be TRUE or FALSE"); bool narm = LOGICAL(narmArg)[0]==1; - const int n = LENGTH(x); + const R_xlen_t n = xlength(x); if (n==0) return ScalarInteger(0); // empty vector Rboolean first = LOGICAL(x)[0]; - int i=0; + R_xlen_t i=0; while (++i Date: Thu, 1 Mar 2018 22:41:15 -0800 Subject: [PATCH 8/8] Added timings to NEWS --- NEWS.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index d4ec6c713..5e47f7cd6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -65,7 +65,17 @@ 13. `unique(DT)` now returns `DT` early when there are no duplicates to save RAM, [#2013](https://github.com/Rdatatable/data.table/issues/2013). Thanks to Michael Chirico for the PR. -14. `uniqueN` is now faster on logical vectors. Thanks to Hugh Parsonage for the PR. +14. `uniqueN()` is now faster on logical vectors. Thanks to Hugh Parsonage for [PR#2648](https://github.com/Rdatatable/data.table/pull/2648). + ``` + N = 1e9 + was now + x = c(TRUE,FALSE,NA,rep(TRUE,N)) + uniqueN(x) == 3 5.4s 0.00s + x = c(TRUE,rep(FALSE,N), NA) + uniqueN(x,na.rm=TRUE) == 2 5.4s 0.00s + x = c(rep(TRUE,N),FALSE,NA) + uniqueN(x) == 3 6.7s 0.38s + ``` 15. Subsetting optimization with keys and indices is now possible for compound queries like `DT[a==1 & b==2]`, [#2472](https://github.com/Rdatatable/data.table/issues/2472). Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementation.