From a39897265a84d046c079c8b308c840c86e78ba6a Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Tue, 18 Oct 2016 18:58:10 -0700
Subject: [PATCH] Faster keyby (e.g. 25s down to 13s). Passes all tests with no
 changes. #1880

---
 NEWS.md        |  2 ++
 R/data.table.R | 23 ++++++++++++++---------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 21f60b8d9..095cf2040 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -94,6 +94,8 @@
   28. New argument `print.class` for `print.data.table` allows for including column class under column names (as inspired by `tbl_df` in `dplyr`); default (adjustable via `"datatable.print.class"` option) is `FALSE`, the inherited behavior. Part of [#1523](https://github.com/Rdatatable/data.table/issues/1523); thanks to @MichaelChirico for the FR & PR.
   
   29. `all.equal.data.table` gains `check.attributes`, `ignore.col.order`, `ignore.row.order` and `tolerance` arguments.
+  
+  30. `keyby=` is now much faster by not doing not needed work; e.g. 25s down to 13s for a 1.5GB DT with 200m rows and 86m groups. With more groups or bigger data, larger speedup factors are possible. Please always use `keyby=` unless you really need `by=`. `by=` returns the groups in first appearance order and takes longer to do that. See [#1880](https://github.com/Rdatatable/data.table/issues/1880) for more info and please register your views there on changing the default.
 
 #### BUG FIXES
 
diff --git a/R/data.table.R b/R/data.table.R
index ad0eb7757..9d3730847 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -1542,7 +1542,13 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
         if (length(byval) && length(byval[[1]])) {
             if (!bysameorder) {
                 if (verbose) {last.started.at=proc.time()[3];cat("Finding groups using forderv ... ");flush.console()}
-                o__ = forderv(byval, sort=FALSE, retGrp=TRUE)   # returns integer() (not NULL) if already ordered, to save 1:xnrow for efficiency
+                o__ = forderv(byval, sort=!missing(keyby), retGrp=TRUE)
+                # The sort= argument is called sortStr at C level. It's just about saving the sort of unique strings at
+                # C level for efficiency (cgroup vs csort) when by= not keyby=. All other types are always sorted. Getting 
+                # orginal order below is the part that retains original order. Passing sort=TRUE here always won't change any 
+                # result at all (tested and confirmed to double check), it'll just make by= slower when there's a large
+                # number of unique strings. It must be TRUE when keyby= though, since the key is just marked afterwards.
+                # forderv() returns empty integer() if already ordered to save allocating 1:xnrow
                 bysameorder = orderedirows && !length(o__)
                 if (verbose) {
                     cat(round(proc.time()[3]-last.started.at, 3), "sec\n")
@@ -1553,7 +1559,8 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                 f__ = attr(o__, "starts")
                 len__ = uniqlengths(f__, xnrow)
                 if (verbose) { cat(round(proc.time()[3]-last.started.at, 3), "sec\n");flush.console()}
-                if (!bysameorder) {    # TO DO: lower this into forder.c
+                if (!bysameorder && missing(keyby)) {
+                    # TO DO: lower this into forder.c
                     if (verbose) {last.started.at=proc.time()[3];cat("Getting back original order ... ");flush.console()}
                     firstofeachgroup = o__[f__]    
                     if (length(origorder <- forderv(firstofeachgroup))) {
@@ -1562,7 +1569,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                     }
                     if (verbose) {cat(round(proc.time()[3]-last.started.at, 3), "sec\n")}
                 }
-                if (!orderedirows && !length(o__)) o__ = 1:xnrow  # temp fix.  TO DO: revist orderedirows
+                if (!orderedirows && !length(o__)) o__ = 1:xnrow  # temp fix.  TODO: revist orderedirows
             } else {
                 if (verbose) {last.started.at=proc.time()[3];cat("Finding groups using uniqlist ... ");flush.console()}
                 f__ = uniqlist(byval)
@@ -1898,7 +1905,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
         if (!missing(keyby)) {
             cnames = as.character(bysubl)[-1]
             if (all(cnames %chin% names(x))) {
-                if (verbose) {last.started.at=proc.time()[3];cat("setkey() afterwards ... ");flush.console()}                    
+                if (verbose) {last.started.at=proc.time()[3];cat("setkey() after the := with keyby= ... ");flush.console()}
                 setkeyv(x,cnames)  # TO DO: setkey before grouping to get memcpy benefit.
                 if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console()}
             }
@@ -1924,13 +1931,11 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
     } else {
         setnames(ans,seq_along(bynames),bynames)   # TO DO: reinvestigate bynames flowing from dogroups here and simplify
     }
-    if (!missing(keyby)) {
-        if (verbose) {last.started.at=proc.time()[3];cat("setkey() afterwards ... ");flush.console()}
+    if (byjoin && !missing(keyby) && !bysameorder) {
+        if (verbose) {last.started.at=proc.time()[3];cat("setkey() afterwards for keyby=.EACHI ... ");flush.console()}
         setkeyv(ans,names(ans)[seq_along(byval)])
         if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console()}
-        # but if 'bykey' and 'bysameorder' then the setattr in branch above will run instead for
-        # speed (because !missing(by) when bykey, too)
-    } else if (haskey(x) && bysameorder) {
+    } else if (!missing(keyby) || (haskey(x) && bysameorder)) {
         setattr(ans,"sorted",names(ans)[seq_along(grpcols)])
     }
     alloc.col(ans)   # TO DO: overallocate in dogroups in the first place and remove this line