R/setops.R

# For internal use only (input symbol requirement is not checked)
#   cols [symbol] - columns provided to function argument
#   dt   [symbol] - a data.table
# Iff all of 'cols' is present in 'x' return col indices
# is.data.table(dt) check should be performed in the calling function
validate <- function(cols, dt) {
  argcols = deparse(substitute(cols))
  argdt = deparse(substitute(dt))
  origcols = cols
  if (is.character(cols)) cols = chmatch(cols, names(dt))
  cols = as.integer(cols)
  isna = which(!cols %in% seq_along(dt))
  if (length(isna))
    stop(argcols, " value(s) [", paste(origcols[isna], collapse=", "), "] not present (or out of range) in ", argdt)
  cols
}

# setdiff for data.tables, internal at the moment #547, used in not-join
setdiff_ <- function(x, y, by.x=seq_along(x), by.y=seq_along(y), use.names=FALSE) {
  if (!is.data.table(x) || !is.data.table(y)) stop("x and y must both be data.tables")
  if (is.null(x) || !length(x)) return(x)
  by.x = validate(by.x, x)
  if (is.null(y) || !length(y)) return(unique(x, by=by.x))
  by.y = validate(by.y, y)
  if (length(by.x) != length(by.y)) stop("length(by.x) != length(by.y)")
  # factor in x should've factor/character in y, and viceversa
  for (a in seq_along(by.x)) {
    lc = by.y[a]
    rc = by.x[a]
    icnam = names(y)[lc]
    xcnam = names(x)[rc]
    if ( is.character(x[[rc]]) && !(is.character(y[[lc]]) || is.factor(y[[lc]])) ) {
      stop("When x's column ('",xcnam,"') is character, the corresponding column in y ('",icnam,"') should be factor or character, but found incompatible type '",typeof(y[[lc]]),"'.")
    } else if ( is.factor(x[[rc]]) && !(is.character(y[[lc]]) || is.factor(y[[lc]])) ) {
      stop("When x's column ('",xcnam,"') is factor, the corresponding column in y ('",icnam,"') should be character or factor, but found incompatible type '",typeof(y[[lc]]),"'.")
    } else if ( (is.integer(x[[rc]]) || is.double(x[[rc]])) && (is.logical(y[[lc]]) || is.character(y[[lc]])) ) {
      stop("When x's column ('",xcnam,"') is integer or numeric, the corresponding column in y ('",icnam,"') can not be character or logical types, but found incompatible type '",typeof(y[[lc]]),"'.")
    }
  }
  ux = unique(shallow(x, by.x))
  uy = unique(shallow(y, by.y))
  ix = duplicated(rbind(uy, ux, use.names=use.names, fill=FALSE))[-seq_len(nrow(uy))]
  .Call(CsubsetDT, ux, which_(ix, FALSE), seq_along(ux)) # more memory efficient version of which(!ix)
}

# set operators ----

funique <- function(x) {
  stopifnot(is.data.table(x))
  dup = duplicated(x)
  if (any(dup)) .Call(CsubsetDT, x, which_(dup, FALSE), seq_along(x)) else x
}

fintersect <- function(x, y, all=FALSE) {
  if (!is.logical(all) || length(all) != 1L) stop("argument 'all' should be logical of length one")
  if (!is.data.table(x) || !is.data.table(y)) stop("x and y must be both data.tables")
  if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have same column names")
  if (!identical(names(x), names(y))) stop("x and y must have same column order")
  bad.type = setNames(c("raw","complex","list") %chin% c(vapply(x, typeof, FUN.VALUE = ""), vapply(y, typeof, FUN.VALUE = "")), c("raw","complex","list"))
  if (any(bad.type)) stop(sprintf("x and y must not have unsupported column types: %s", paste(names(bad.type)[bad.type], collapse=", ")))
  if (!identical(lapply(x, class), lapply(y, class))) stop("x and y must have same column classes")
  if (".seqn" %in% names(x)) stop("None of the datasets to intersect should contain a column named '.seqn'")
  if (!nrow(x) || !nrow(y)) return(x[0L])
  if (all) {
    x = shallow(x)[, ".seqn" := rowidv(x)]
    y = shallow(y)[, ".seqn" := rowidv(y)]
    jn.on = c(".seqn",setdiff(names(x),".seqn"))
    x[y, .SD, .SDcols=setdiff(names(x),".seqn"), nomatch=0L, on=jn.on]
  } else {
    x[funique(y), nomatch=0L, on=names(x), mult="first"]
  }
}

fsetdiff <- function(x, y, all=FALSE) {
  if (!is.logical(all) || length(all) != 1L) stop("argument 'all' should be logical of length one")
  if (!is.data.table(x) || !is.data.table(y)) stop("x and y must be both data.tables")
  if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have same column names")
  if (!identical(names(x), names(y))) stop("x and y must have same column order")
  bad.type = setNames(c("raw","complex","list") %chin% c(vapply(x, typeof, FUN.VALUE = ""), vapply(y, typeof, FUN.VALUE = "")), c("raw","complex","list"))
  if (any(bad.type)) stop(sprintf("x and y must not have unsupported column types: %s", paste(names(bad.type)[bad.type], collapse=", ")))
  if (!identical(lapply(x, class), lapply(y, class))) stop("x and y must have same column classes")
  if (".seqn" %in% names(x)) stop("None of the datasets to setdiff should contain a column named '.seqn'")
  if (!nrow(x)) return(x)
  if (!nrow(y)) return(if (!all) funique(x) else x)
  if (all) {
    x = shallow(x)[, ".seqn" := rowidv(x)]
    y = shallow(y)[, ".seqn" := rowidv(y)]
    jn.on = c(".seqn",setdiff(names(x),".seqn"))
    x[!y, .SD, .SDcols=setdiff(names(x),".seqn"), on=jn.on]
  } else {
    funique(x[!y, on=names(x)])
  }
}

funion <- function(x, y, all=FALSE) {
  if (!is.logical(all) || length(all) != 1L) stop("argument 'all' should be logical of length one")
  if (!is.data.table(x) || !is.data.table(y)) stop("x and y must be both data.tables")
  if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have same column names")
  if (!identical(names(x), names(y))) stop("x and y must have same column order")
  bad.type = setNames(c("raw","complex", if(!all) "list") %chin% c(vapply(x, typeof, FUN.VALUE = ""), vapply(y, typeof, FUN.VALUE = "")), c("raw","complex", if(!all) "list"))
  if (any(bad.type)) stop(sprintf("x and y must not have unsupported column types: %s", paste(names(bad.type)[bad.type], collapse=", ")))
  if (!identical(lapply(x, class), lapply(y, class))) stop("x and y must have same column classes")
  ans = rbindlist(list(x, y))
  if (!all) ans = funique(ans)
  ans
}

fsetequal <- function(x, y) {
  if (!is.data.table(x) || !is.data.table(y)) stop("x and y must be both data.tables")
  if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have same column names")
  if (!identical(names(x), names(y))) stop("x and y must have same column order")
  bad.type = setNames(c("raw","complex","list") %chin% c(vapply(x, typeof, FUN.VALUE = ""), vapply(y, typeof, FUN.VALUE = "")), c("raw","complex","list"))
  if (any(bad.type)) stop(sprintf("x and y must not have unsupported column types: %s", paste(names(bad.type)[bad.type], collapse=", ")))
  if (!identical(lapply(x, class), lapply(y, class))) stop("x and y must have same column classes")
  isTRUE(all.equal.data.table(x, y, check.attributes = FALSE, ignore.row.order = TRUE))
}

# all.equal ----

all.equal.data.table <- function(target, current, trim.levels=TRUE, check.attributes=TRUE, ignore.col.order=FALSE, ignore.row.order=FALSE, tolerance=sqrt(.Machine$double.eps), ...) {
  stopifnot(is.logical(trim.levels), is.logical(check.attributes), is.logical(ignore.col.order), is.logical(ignore.row.order), is.numeric(tolerance))
  if (!is.data.table(target) || !is.data.table(current)) stop("'target' and 'current' must be both data.tables")

  msg = character(0L)
  # init checks that detect high level all.equal
  if (nrow(current) != nrow(target)) msg = "Different number of rows"
  if (ncol(current) != ncol(target)) msg = c(msg, "Different number of columns")
  diff.colnames = !identical(sort(names(target)), sort(names(current)))
  diff.colorder = !identical(names(target), names(current))
  if (check.attributes && diff.colnames) msg = c(msg, "Different column names")
  if (!diff.colnames && !ignore.col.order && diff.colorder) msg = c(msg, "Different column order")

  if (length(msg)) return(msg) # skip check.attributes and further heavy processing

  # ignore.col.order
  if (ignore.col.order && diff.colorder) current = setcolorder(shallow(current), names(target))

  # Always check modes equal, like base::all.equal
  targetModes = vapply_1c(target, mode)
  currentModes = vapply_1c(current,  mode)
  if (any( d<-(targetModes!=currentModes) )) {
    w = head(which(d),3L)
    return(paste0("Datasets have different column modes. First 3: ",paste(
     paste0(names(targetModes)[w],"(",paste(targetModes[w],currentModes[w],sep="!="),")")
            ,collapse=" ")))
  }

  if (check.attributes) {
    squashClass = function(x) if (is.object(x)) paste(class(x),collapse=";") else mode(x)
    # else mode() is so that integer==numeric, like base all.equal does.
    targetTypes = vapply_1c(target, squashClass)
    currentTypes = vapply_1c(current, squashClass)
    if (length(targetTypes) != length(currentTypes))
      stop("Internal error: ncol(current)==ncol(target) was checked above")
    if (any( d<-(targetTypes != currentTypes))) {
      w = head(which(d),3L)
      return(paste0("Datasets have different column classes. First 3: ",paste(
     paste0(names(targetTypes)[w],"(",paste(targetTypes[w],currentTypes[w],sep="!="),")")
            ,collapse=" ")))
    }
  }

  if (check.attributes) {
    # check key
    k1 = key(target)
    k2 = key(current)
    if (!identical(k1, k2)) {
      return(sprintf("Datasets has different keys. 'target'%s. 'current'%s.",
               if(length(k1)) paste0(": ", paste(k1, collapse=", ")) else " has no key",
               if(length(k2)) paste0(": ", paste(k2, collapse=", ")) else " has no key"))
    }
    # check index
    i1 = indices(target)
    i2 = indices(current)
    if (!identical(i1, i2)) {
      return(sprintf("Datasets has different indexes. 'target'%s. 'current'%s.",
               if(length(i1)) paste0(": ", paste(i1, collapse=", ")) else " has no index",
               if(length(i2)) paste0(": ", paste(i2, collapse=", ")) else " has no index"))
    }

    # Trim any extra row.names attributes that came from some inheritence
    # Trim ".internal.selfref" as long as there is no `all.equal.externalptr` method
    exclude.attrs = function(x, attrs = c("row.names",".internal.selfref")) x[!names(x) %in% attrs]
    a1 = exclude.attrs(attributes(target))
    a2 = exclude.attrs(attributes(current))
    if (length(a1) != length(a2)) return(sprintf("Datasets has different number of (non-excluded) attributes: target %s, current %s", length(a1), length(a2)))
    if (!identical(nm1 <- sort(names(a1)), nm2 <- sort(names(a2)))) return(sprintf("Datasets has attributes with different names: %s", paste(setdiff(union(names(a1), names(a2)), intersect(names(a1), names(a2))), collapse=", ")))
    attrs.r = all.equal(a1[nm1], a2[nm2], ..., check.attributes = check.attributes)
    if (is.character(attrs.r)) return(paste("Attributes: <", attrs.r, ">")) # skip further heavy processing
  }

  if (ignore.row.order) {
    if (".seqn" %in% names(target))
      stop("None of the datasets to compare should contain a column named '.seqn'")
    bad.type = setNames(c("raw","complex","list") %chin% c(vapply(current, typeof, FUN.VALUE = ""), vapply(target, typeof, FUN.VALUE = "")), c("raw","complex","list"))
    if (any(bad.type))
      stop(sprintf("Datasets to compare with 'ignore.row.order' must not have unsupported column types: %s", paste(names(bad.type)[bad.type], collapse=", ")))
    if (between(tolerance, 0, sqrt(.Machine$double.eps), incbounds=FALSE)) {
      warning(sprintf("Argument 'tolerance' was forced to lowest accepted value `sqrt(.Machine$double.eps)` from provided %s", format(tolerance, scientific=FALSE)))
      tolerance = sqrt(.Machine$double.eps)
    }
    target_dup = as.logical(anyDuplicated(target))
    current_dup = as.logical(anyDuplicated(current))
    tolerance.msg = if (identical(tolerance, 0)) ", be aware you are using `tolerance=0` which may result into visually equal data" else ""
    if (target_dup || current_dup) {
      # handling 'tolerance' for duplicate rows - those `msg` will be returned only when equality with tolerance will fail
      if (any(vapply_1c(target,typeof)=="double") && !identical(tolerance, 0)) {
        if (target_dup && !current_dup) msg = c(msg, "Dataset 'target' has duplicate rows while 'current' doesn't")
        else if (!target_dup && current_dup) msg = c(msg, "Dataset 'current' has duplicate rows while 'target' doesn't")
        else { # both
          if (!identical(tolerance, sqrt(.Machine$double.eps))) # non-default will raise error
            stop("Duplicate rows in datasets, numeric columns and ignore.row.order cannot be used with non 0 tolerance argument")
          msg = c(msg, "Both datasets have duplicate rows, they also have numeric columns, together with ignore.row.order this force 'tolerance' argument to 0")
          tolerance = 0
        }
      } else { # no numeric columns or tolerance==0L
        if (target_dup && !current_dup)
          return(sprintf("Dataset 'target' has duplicate rows while 'current' doesn't%s", tolerance.msg))
        if (!target_dup && current_dup)
          return(sprintf("Dataset 'current' has duplicate rows while 'target' doesn't%s", tolerance.msg))
      }
    }
    jn.on = if (target_dup && current_dup) {
      target = shallow(target)[, ".seqn" := rowidv(target)]
      current = shallow(current)[, ".seqn" := rowidv(current)]
      c(".seqn", setdiff(names(target), ".seqn"))
    } else names(target)
    # handling 'tolerance' for factor cols - those `msg` will be returned only when equality with tolerance will fail
    if (any(vapply_1b(target,is.factor)) && !identical(tolerance, 0)) {
      if (!identical(tolerance, sqrt(.Machine$double.eps))) # non-default will raise error
        stop("Factor columns and ignore.row.order cannot be used with non 0 tolerance argument")
      msg = c(msg, "Using factor columns together together with ignore.row.order, this force 'tolerance' argument to 0")
      tolerance = 0
    }
    # roll join to support 'tolerance' argument, conditional to retain support for factor when tolerance=0
    ans = if (identical(tolerance, 0)) target[current, nomatch=NA, which=TRUE, on=jn.on] else {
      ans1 = target[current, roll=tolerance, rollends=TRUE, which=TRUE, on=jn.on]
      ans2 = target[current, roll=-tolerance, rollends=TRUE, which=TRUE, on=jn.on]
      pmin(ans1, ans2, na.rm=TRUE)
    }
    if (any_na(as_list(ans))) {
      msg = c(msg, sprintf("Dataset 'current' has rows not present in 'target'%s%s", if (target_dup || current_dup) " or present in different quantity" else "", tolerance.msg))
      return(msg)
    }
    ans = if (identical(tolerance, 0)) current[target, nomatch=NA, which=TRUE, on=jn.on] else {
      ans1 = current[target, roll=tolerance, rollends=TRUE, which=TRUE, on=jn.on]
      ans2 = current[target, roll=-tolerance, rollends=TRUE, which=TRUE, on=jn.on]
      pmin(ans1, ans2, na.rm=TRUE)
    }
    if (any_na(as_list(ans))) {
      msg = c(msg, sprintf("Dataset 'target' has rows not present in 'current'%s%s", if (target_dup || current_dup) " or present in different quantity" else "", tolerance.msg))
      return(msg)
    }
  } else {
    for (i in seq_along(target)) {
      # trim.levels moved here
      x = target[[i]]
      y = current[[i]]
      if (xor(is.factor(x),is.factor(y)))
        return("Internal error: factor type mismatch should have been caught earlier")
      cols.r = TRUE
      if (is.factor(x)) {
        if (!identical(levels(x),levels(y))) {
          if (trim.levels) {
            # do this regardless of check.attributes (that's more about classes, checked above)
            x = factor(x)
            y = factor(y)
            if (!identical(levels(x),levels(y)))
            cols.r = "Levels not identical even after refactoring since trim.levels is TRUE"
          } else {
            cols.r = "Levels not identical. No attempt to refactor because trim.levels is FALSE"
          }
        } else {
          cols.r = all.equal(x, y, check.attributes=check.attributes)
          # the check.attributes here refers to everything other than the levels, which are always
          # dealt with according to trim.levels
        }
      } else {
        cols.r = all.equal(unclass(x), unclass(y), tolerance=tolerance, ..., check.attributes=check.attributes)
        # classes were explicitly checked earlier above, so ignore classes here.
      }
      if (!isTRUE(cols.r)) return(paste0("Column '", names(target)[i], "': ", paste(cols.r,collapse=" ")))
    }
  }
  TRUE
}